diff --git a/Makefile b/Makefile index e14c51ea..62d480a1 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ Q ?= @ # CPU_ONLY := 1 CXX ?= g++ -PYTHON ?= python +PYTHON ?= python3.6 EXTENSION_NAME := minkowski @@ -66,17 +66,27 @@ ifneq ($(CPU_ONLY), 1) endif SRC_DIR := ./src +SRC_GPU_COORDS_MAP_DIR := ./src/3rdparty/gpu_coords_map/include +SRC_SLAB_HASH_DIR := ./src/3rdparty/gpu_coords_map/include/slab_hash OBJ_DIR := ./objs CPP_SRCS := $(wildcard $(SRC_DIR)/*.cpp) +CPP_SRCS_GPU_COORDS_MAP := $(wildcard $(SRC_GPU_COORDS_MAP_DIR)/*.cpp) +CPP_SRCS_SLAB_HASH:= $(wildcard $(SRC_SLAB_HASH_DIR)/*.cpp) CU_SRCS := $(wildcard $(SRC_DIR)/*.cu) +CU_SRCS_GPU_COORDS_MAP := $(wildcard $(SRC_GPU_COORDS_MAP_DIR)/*.cu) +CU_SRCS_SLAB_HASH:= $(wildcard $(SRC_SLAB_HASH_DIR)/*.cu) OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(CPP_SRCS)) +OBJS_GPU_COORDS_MAP := $(patsubst $(SRC_GPU_COORDS_MAP_DIR)/%.cpp,$(OBJ_DIR)/3rdparty/gpu_coords_map/include/%.o,$(CPP_SRCS_GPU_COORDS_MAP)) +OBJS_SLAB_HASH := $(patsubst $(SRC_SLAB_HASH_DIR)/%.cpp,$(OBJ_DIR)/3rdparty/gpu_coords_map/include/slab_hash/%.o,$(CPP_SRCS_SLAB_HASH)) CU_OBJS := $(patsubst $(SRC_DIR)/%.cu,$(OBJ_DIR)/cuda/%.o,$(CU_SRCS)) +CU_OBJS_GPU_COORDS_MAP := $(patsubst $(SRC_GPU_COORDS_MAP_DIR)/%.cu,$(OBJ_DIR)/3rdparty/gpu_coords_map/include/cuda/%.o,$(CU_SRCS_GPU_COORDS_MAP)) +CU_OBJS_SLAB_HASH := $(patsubst $(SRC_SLAB_HASH_DIR)/%.cu,$(OBJ_DIR)/3rdparty/gpu_coords_map/include/slab_hash/cuda/%.o,$(CU_SRCS_SLAB_HASH)) STATIC_LIB := $(OBJ_DIR)/lib$(EXTENSION_NAME).a # We will also explicitly add stdc++ to the link target. LIBRARIES := stdc++ c10 caffe2 torch torch_python _C ifneq ($(CPU_ONLY), 1) - LIBRARIES += cudart cublas cusparse caffe2_gpu c10_cuda + LIBRARIES += cudadevrt cudart cudadevrt cublas cudadevrt cusparse cudadevrt caffe2_gpu cudadevrt c10_cuda cudadevrt CUDA_ARCH := -gencode arch=compute_30,code=sm_30 \ -gencode arch=compute_35,code=sm_35 \ -gencode=arch=compute_50,code=sm_50 \ @@ -118,6 +128,7 @@ ifeq ($(DEBUG), 1) COMMON_FLAGS += -DDEBUG -g -O0 # https://gcoe-dresden.de/reaching-the-shore-with-a-fog-warning-my-eurohack-day-4-morning-session/ NVCCFLAGS := -g -G # -rdc true + # NVCCFLAGS := -g -G -rdc true else COMMON_FLAGS += -DNDEBUG -O3 endif @@ -140,6 +151,7 @@ COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) \ CXXFLAGS += -fopenmp -fPIC -fwrapv -std=c++14 $(COMMON_FLAGS) $(WARNINGS) NVCCFLAGS += -std=c++14 -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS) +NVCCFLAGS += -rdc true LINKFLAGS += -pthread -fPIC $(WARNINGS) -Wl,-rpath=$(PYTHON_LIB_DIR) -Wl,--no-as-needed -Wl,--sysroot=/ LDFLAGS += $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) \ $(foreach library,$(LIBRARIES),-l$(library)) @@ -148,7 +160,7 @@ ifeq ($(CPU_ONLY), 1) ALL_OBJS := $(OBJS) CXXFLAGS += -DCPU_ONLY else - ALL_OBJS := $(OBJS) $(CU_OBJS) + ALL_OBJS := $(OBJS) $(OBJS_GPU_COORDS_MAP) $(OBJS_SLAB_HASH) $(CU_OBJS) $(CU_OBJS_GPU_COORDS_MAP) $(CU_OBJS_SLAB_HASH) endif all: $(STATIC_LIB) @@ -157,8 +169,19 @@ all: $(STATIC_LIB) $(OBJ_DIR): @ mkdir -p $@ @ mkdir -p $@/cuda + @ mkdir -p $@/3rdparty/gpu_coords_map/include/cuda + @ mkdir -p $@/3rdparty/gpu_coords_map/include/slab_hash/cuda $(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp | $(OBJ_DIR) + @ echo CXX $< + @ echo $(CXXFLAGS) + $(Q)$(CXX) $< $(CXXFLAGS) -c -o $@ + +$(OBJ_DIR)/3rdparty/gpu_coords_map/include/%.o: $(SRC_GPU_COORDS_MAP_DIR)/%.cpp | $(OBJ_DIR) + @ echo CXX $< + $(Q)$(CXX) $< $(CXXFLAGS) -c -o $@ + +$(OBJ_DIR)/3rdparty/gpu_coords_map/include/slab_hash/%.o: $(SRC_SLAB_HASH_DIR)/%.cpp | $(OBJ_DIR) @ 
echo CXX $< $(Q)$(CXX) $< $(CXXFLAGS) -c -o $@ @@ -168,8 +191,23 @@ $(OBJ_DIR)/cuda/%.o: $(SRC_DIR)/%.cu | $(OBJ_DIR) -odir $(@D) $(Q)$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ +$(OBJ_DIR)/3rdparty/gpu_coords_map/include/cuda/%.o: $(SRC_GPU_COORDS_MAP_DIR)/%.cu | $(OBJ_DIR) + @ echo NVCC $< + $(Q)$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \ + -odir $(@D) + $(Q)$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ + +$(OBJ_DIR)/3rdparty/gpu_coords_map/include/slab_hash/cuda/%.o: $(SRC_SLAB_HASH_DIR)/%.cu| $(OBJ_DIR) + @ echo NVCC $< + $(Q)$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \ + -odir $(@D) + $(Q)$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ + $(STATIC_LIB): $(ALL_OBJS) | $(OBJ_DIR) $(RM) -f $(STATIC_LIB) + @ echo $(LINKFLAGS) + @ echo $(LDFLAGS) + @ echo $(CXXFLAGS) @ echo LD -o $@ ar rc $(STATIC_LIB) $(ALL_OBJS) diff --git a/MinkowskiEngine/MinkowskiCoords.py b/MinkowskiEngine/MinkowskiCoords.py index 38ff118c..4a258f9f 100644 --- a/MinkowskiEngine/MinkowskiCoords.py +++ b/MinkowskiEngine/MinkowskiCoords.py @@ -35,7 +35,8 @@ if 'OMP_NUM_THREADS' in os.environ: CPU_COUNT = int(os.environ['OMP_NUM_THREADS']) -_memory_manager_backend = MemoryManagerBackend.PYTORCH +#_memory_manager_backend = MemoryManagerBackend.PYTORCH +_memory_manager_backend = MemoryManagerBackend.CUDA def set_memory_manager_backend(backend: MemoryManagerBackend): @@ -102,7 +103,8 @@ class CoordsManager(): def __init__(self, num_threads: int = -1, memory_manager_backend: MemoryManagerBackend = None, - D: int = -1): + D: int = -1, + device: str = 'cuda'): if D < 1: raise ValueError(f"Invalid dimension {D}") self.D = D @@ -111,7 +113,9 @@ def __init__(self, if memory_manager_backend is None: global _memory_manager_backend memory_manager_backend = _memory_manager_backend - coords_man = MEB.CoordsManager(num_threads, memory_manager_backend) + coords_man = MEB.CoordsManager(num_threads, memory_manager_backend) \ + if device == 'cpu' else \ + MEB.GPUCoordsManager(D, 0, memory_manager_backend) self.CPPCoordsManager = coords_man def initialize(self, @@ -120,14 +124,15 @@ def initialize(self, force_creation: bool = False, force_remap: bool = False, allow_duplicate_coords: bool = False, - return_inverse: bool = False) -> torch.LongTensor: + return_inverse: bool = False) -> torch.IntTensor: assert isinstance(coords_key, CoordsKey) - unique_index = torch.LongTensor() - inverse_mapping = torch.LongTensor() + # TODO(ljm): Adjust cpu interface from long to int acoordingly + unique_index = torch.IntTensor() + inverse_mapping = torch.IntTensor() self.CPPCoordsManager.initializeCoords( coords, unique_index, inverse_mapping, coords_key.CPPCoordsKey, force_creation, force_remap, allow_duplicate_coords, return_inverse) - return unique_index, inverse_mapping + return unique_index.long(), inverse_mapping.long() def create_coords_key(self, coords: torch.IntTensor, @@ -171,6 +176,9 @@ def stride(self, def reduce(self): origin_key = CoordsKey(self.D) origin_key.setTensorStride(convert_to_int_list(0, self.D)) + # TODO(ljm): Get batch_size by createOriginCoords + # TODO(ljm): Find a better way to get batch_size + # Notice(ljm): It can be concluded that the batch indices are contigous by GetCoordsAt origin_key.setKey(self.CPPCoordsManager.createOriginCoords(self.D)) return origin_key @@ -322,6 +330,9 @@ def get_kernel_map(self, is_transpose, is_pool) + kernel_map[0] = kernel_map[0].long() + kernel_map[1] = kernel_map[1].long() + return kernel_map def get_coords_map(self, in_key_or_tensor_strides, diff --git 
a/MinkowskiEngine/SparseTensor.py b/MinkowskiEngine/SparseTensor.py index a5e22fd7..6d285779 100644 --- a/MinkowskiEngine/SparseTensor.py +++ b/MinkowskiEngine/SparseTensor.py @@ -229,6 +229,7 @@ def __init__( of the current sparse tensor. By default, it is 1. """ + print(coords) assert isinstance(feats, torch.Tensor), "Features must be a torch.Tensor" assert feats.ndim == 2, f"The feature should be a matrix, The input feature is an order-{feats.ndim} tensor." @@ -254,12 +255,14 @@ def __init__( assert isinstance(coords, torch.Tensor), \ "Coordinate must be of type torch.Tensor" + print(isinstance(coords, torch.IntTensor)) if not isinstance(coords, torch.IntTensor): warnings.warn( 'Coords implicitly converted to torch.IntTensor. ' + 'To remove this warning, use `.int()` to convert the ' + 'coords into an torch.IntTensor') - coords = torch.floor(coords).int() + print(isinstance(coords, torch.IntTensor)) +# coords = torch.floor(coords).int() if coords.device.type != 'cpu': warnings.warn( @@ -283,7 +286,9 @@ def __init__( if _global_coords_man is None: _global_coords_man = CoordsManager( memory_manager_backend=memory_manager_backend, - D=coords.size(1) - 1) + D=coords.size(1) - 1, + device=coords.device.type if coords is not None else 'cuda') + # TODO(ljm): handle device when coords is None coords_manager = _global_coords_man else: assert coords is not None, "Initial coordinates must be given" diff --git a/pybind/minkowski.cpp b/pybind/minkowski.cpp index 9c2ec7b1..7442870d 100644 --- a/pybind/minkowski.cpp +++ b/pybind/minkowski.cpp @@ -31,11 +31,13 @@ #include "extern.hpp" #include "src/common.hpp" +#include "src/types.hpp" namespace py = pybind11; namespace mink = minkowski; +/* template void instantiate_func(py::module &m, const std::string &dtypestr) { m.def((std::string("ConvolutionForwardCPU") + dtypestr).c_str(), @@ -188,6 +190,153 @@ void instantiate_func(py::module &m, const std::string &dtypestr) { py::call_guard()); #endif } +*/ + +template +void instantiate_func(py::module &m, const std::string &dtypestr) { + m.def((std::string("ConvolutionForwardCPU") + dtypestr).c_str(), + &mink::ConvolutionForwardCPU, + py::call_guard()); + m.def((std::string("ConvolutionBackwardCPU") + dtypestr).c_str(), + &mink::ConvolutionBackwardCPU, + py::call_guard()); + + m.def((std::string("ConvolutionTransposeForwardCPU") + dtypestr).c_str(), + &mink::ConvolutionTransposeForwardCPU, + py::call_guard()); + m.def((std::string("ConvolutionTransposeBackwardCPU") + dtypestr).c_str(), + &mink::ConvolutionTransposeBackwardCPU, + py::call_guard()); + + m.def((std::string("AvgPoolingForwardCPU") + dtypestr).c_str(), + &mink::AvgPoolingForwardCPU, + py::call_guard()); + m.def((std::string("AvgPoolingBackwardCPU") + dtypestr).c_str(), + &mink::AvgPoolingBackwardCPU, + py::call_guard()); + + m.def((std::string("MaxPoolingForwardCPU") + dtypestr).c_str(), + &mink::MaxPoolingForwardCPU, + py::call_guard()); + m.def((std::string("MaxPoolingBackwardCPU") + dtypestr).c_str(), + &mink::MaxPoolingBackwardCPU, + py::call_guard()); + + m.def((std::string("PoolingTransposeForwardCPU") + dtypestr).c_str(), + &mink::PoolingTransposeForwardCPU, + py::call_guard()); + m.def((std::string("PoolingTransposeBackwardCPU") + dtypestr).c_str(), + &mink::PoolingTransposeBackwardCPU, + py::call_guard()); + + m.def((std::string("GlobalPoolingForwardCPU") + dtypestr).c_str(), + &mink::GlobalPoolingForwardCPU, + py::call_guard()); + m.def((std::string("GlobalPoolingBackwardCPU") + dtypestr).c_str(), + &mink::GlobalPoolingBackwardCPU, + 
py::call_guard()); + + m.def((std::string("GlobalMaxPoolingForwardCPU") + dtypestr).c_str(), + &mink::GlobalMaxPoolingForwardCPU, + py::call_guard()); + m.def((std::string("GlobalMaxPoolingBackwardCPU") + dtypestr).c_str(), + &mink::GlobalMaxPoolingBackwardCPU, + py::call_guard()); + + m.def((std::string("BroadcastForwardCPU") + dtypestr).c_str(), + &mink::BroadcastForwardCPU, + py::call_guard()); + m.def((std::string("BroadcastBackwardCPU") + dtypestr).c_str(), + &mink::BroadcastBackwardCPU, + py::call_guard()); + + m.def((std::string("PruningForwardCPU") + dtypestr).c_str(), + &mink::PruningForwardCPU, + py::call_guard()); + m.def((std::string("PruningBackwardCPU") + dtypestr).c_str(), + &mink::PruningBackwardCPU, + py::call_guard()); + + m.def((std::string("UnionForwardCPU") + dtypestr).c_str(), + &mink::UnionForwardCPU, + py::call_guard()); + m.def((std::string("UnionBackwardCPU") + dtypestr).c_str(), + &mink::UnionBackwardCPU, + py::call_guard()); +} + +template +void instantiate_func_gpu(py::module &m, const std::string &dtypestr) { + m.def((std::string("ConvolutionForwardGPU") + dtypestr).c_str(), + &mink::ConvolutionForwardGPU, + py::call_guard()); + m.def((std::string("ConvolutionBackwardGPU") + dtypestr).c_str(), + &mink::ConvolutionBackwardGPU, + py::call_guard()); + + m.def((std::string("ConvolutionTransposeForwardGPU") + dtypestr).c_str(), + &mink::ConvolutionTransposeForwardGPU, + py::call_guard()); + m.def((std::string("ConvolutionTransposeBackwardGPU") + dtypestr).c_str(), + &mink::ConvolutionTransposeBackwardGPU, + py::call_guard()); + + m.def((std::string("AvgPoolingForwardGPU") + dtypestr).c_str(), + &mink::AvgPoolingForwardGPU, + py::call_guard()); + m.def((std::string("AvgPoolingBackwardGPU") + dtypestr).c_str(), + &mink::AvgPoolingBackwardGPU, + py::call_guard()); + + m.def((std::string("MaxPoolingForwardGPU") + dtypestr).c_str(), + &mink::MaxPoolingForwardGPU, + py::call_guard()); + m.def((std::string("MaxPoolingBackwardGPU") + dtypestr).c_str(), + &mink::MaxPoolingBackwardGPU, + py::call_guard()); + + m.def((std::string("PoolingTransposeForwardGPU") + dtypestr).c_str(), + &mink::PoolingTransposeForwardGPU, + py::call_guard()); + m.def((std::string("PoolingTransposeBackwardGPU") + dtypestr).c_str(), + &mink::PoolingTransposeBackwardGPU, + py::call_guard()); + + m.def((std::string("GlobalPoolingForwardGPU") + dtypestr).c_str(), + &mink::GlobalPoolingForwardGPU, + py::call_guard()); + m.def((std::string("GlobalPoolingBackwardGPU") + dtypestr).c_str(), + &mink::GlobalPoolingBackwardGPU, + py::call_guard()); + + m.def((std::string("GlobalMaxPoolingForwardGPU") + dtypestr).c_str(), + &mink::GlobalMaxPoolingForwardGPU, + py::call_guard()); + m.def((std::string("GlobalMaxPoolingBackwardGPU") + dtypestr).c_str(), + &mink::GlobalMaxPoolingBackwardGPU, + py::call_guard()); + + m.def((std::string("BroadcastForwardGPU") + dtypestr).c_str(), + &mink::BroadcastForwardGPU, + py::call_guard()); + m.def((std::string("BroadcastBackwardGPU") + dtypestr).c_str(), + &mink::BroadcastBackwardGPU, + py::call_guard()); + + m.def((std::string("PruningForwardGPU") + dtypestr).c_str(), + &mink::PruningForwardGPU, + py::call_guard()); + m.def((std::string("PruningBackwardGPU") + dtypestr).c_str(), + &mink::PruningBackwardGPU, + py::call_guard()); + + m.def((std::string("UnionForwardGPU") + dtypestr).c_str(), + &mink::UnionForwardGPU, + py::call_guard()); + m.def((std::string("UnionBackwardGPU") + dtypestr).c_str(), + &mink::UnionBackwardGPU, + py::call_guard()); +} template void 
instantiate_coordsman(py::module &m) { std::string coords_name = std::string("CoordsManager"); @@ -199,9 +348,6 @@ template void instantiate_coordsman(py::module &m) { mink::CoordsManager::existsCoordsKey) .def("getCoordsKey", &mink::CoordsManager::getCoordsKey) .def("getKernelMap", &mink::CoordsManager::getKernelMap) -#ifndef CPU_ONLY - .def("getKernelMapGPU", &mink::CoordsManager::getKernelMapGPU) -#endif .def("getCoordsMap", &mink::CoordsManager::getCoordsMap) .def("getUnionMap", &mink::CoordsManager::getUnionMap) .def("getCoordsSize", @@ -235,12 +381,61 @@ template void instantiate_coordsman(py::module &m) { [](const mink::CoordsManager &a) { return a.toString(); }); } +template void instantiate_coordsman_gpu(py::module &m) { + std::string coords_name = std::string("GPUCoordsManager"); + py::class_>(m, coords_name.c_str()) +// .def(py::init()) + .def(py::init()) + .def("existsCoordsKey", + (bool (mink::GPUCoordsManager::*)(py::object) const) & + mink::GPUCoordsManager::existsCoordsKey) + .def("getCoordsKey", &mink::GPUCoordsManager::getCoordsKey) + .def("getKernelMap", &mink::GPUCoordsManager::getKernelMap) + .def("getCoordsMap", &mink::GPUCoordsManager::getCoordsMap) + .def("getUnionMap", &mink::GPUCoordsManager::getUnionMap) + .def("getCoordsSize", + (int (mink::GPUCoordsManager::*)(py::object) const) & + mink::GPUCoordsManager::getCoordsSize) + .def("getCoords", &mink::GPUCoordsManager::getCoords) + .def("getBatchSize", &mink::GPUCoordsManager::getBatchSize) + .def("getBatchIndices", &mink::GPUCoordsManager::getBatchIndices) + .def("getRowIndicesAtBatchIndex", + &mink::GPUCoordsManager::getRowIndicesAtBatchIndex) + .def("getRowIndicesPerBatch", + &mink::GPUCoordsManager::getRowIndicesPerBatch) + .def("setOriginCoordsKey", + &mink::GPUCoordsManager::setOriginCoordsKey) + .def("initializeCoords", + (uint64_t(mink::GPUCoordsManager::*)( + at::Tensor, at::Tensor, at::Tensor, py::object, const bool, + const bool, const bool, const bool)) & + mink::GPUCoordsManager::initializeCoords, + py::call_guard()) + .def("createStridedCoords", + &mink::GPUCoordsManager::createStridedCoords) + .def("createTransposedStridedRegionCoords", + &mink::GPUCoordsManager::createTransposedStridedRegionCoords) + .def("createPrunedCoords", + &mink::GPUCoordsManager::createPruningCoords) + .def("createOriginCoords", + &mink::GPUCoordsManager::createOriginCoords) +// .def("printDiagnostics", &mink::GPUCoordsManager::printDiagnostics) + .def("__repr__", + [](const mink::GPUCoordsManager &a) { return a.toString(); }); +} + template void instantiate(py::module &m) { instantiate_coordsman(m); instantiate_func(m, std::string("f")); instantiate_func(m, std::string("d")); } +template void instantiate_gpu(py::module &m) { + instantiate_coordsman_gpu(m); + instantiate_func_gpu(m, std::string("f")); + instantiate_func_gpu(m, std::string("d")); +} + template void bind_native(py::module &m) { std::string name = std::string("CoordsKey"); py::class_(m, name.c_str()) @@ -256,6 +451,7 @@ template void bind_native(py::module &m) { .def("__repr__", [](const mink::CoordsKey &a) { return a.toString(); }); // Quantization + // TODO(ljm): quantize_np and quantize_th only support CPU currently. 
m.def("quantize_np", &mink::quantize_np); m.def("quantize_th", &mink::quantize_th); m.def("quantize_label_np", &mink::quantize_label_np); @@ -271,4 +467,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { bind_native(m); instantiate(m); +#ifndef CPU_ONLY + instantiate_gpu(m); +#endif } diff --git a/setup.py b/setup.py index 314e257e..dd1f1b78 100644 --- a/setup.py +++ b/setup.py @@ -106,11 +106,17 @@ def _argparse(pattern, argv, is_flag=True): return arr[0].split("=")[1], argv +print("argv: ") +print(argv) # For cpu only build CPU_ONLY, argv = _argparse("--cpu_only", argv) CPU_ONLY = CPU_ONLY or not torch.cuda.is_available() KEEP_OBJS, argv = _argparse("--keep_objs", argv) FORCE_CUDA, argv = _argparse("--force_cuda", argv) +print("CPU_ONLY: ") +print(CPU_ONLY) +print("FORCE_CUDA: ") +print(FORCE_CUDA) # args with return value CUDA_HOME, argv = _argparse("--cuda_home", argv, False) @@ -125,7 +131,8 @@ def _argparse(pattern, argv, is_flag=True): "PYTHON=" + sys.executable, # curr python ] -extra_compile_args = ["-Wno-deprecated-declarations"] +extra_compile_args = [] +#extra_compile_args = ["-Wno-deprecated-declarations"] extra_link_args = [] libraries = ["minkowski"] @@ -140,6 +147,7 @@ def _argparse(pattern, argv, is_flag=True): else: # system python installation libraries.append("cusparse") + libraries.append("cudadevrt") if not (CUDA_HOME is False): # False when not set, str otherwise print(f"Using CUDA_HOME={CUDA_HOME}") @@ -187,6 +195,20 @@ def _argparse(pattern, argv, is_flag=True): run_command(*compile_args) +''' +print("extra_compile_args: ") +print(extra_compile_args) +print("extra_link_args: ") +print(extra_link_args) +extra_compile_args = { + #'cxx': ['-DBATCH_FIRST=1',], + 'cxx': ['-DBATCH_FIRST=1', '-MMD', '-MP', '-ffast-math', '-funsafe-math-optimizations', '-fno-math-errno', '-DBATCH_FIRST=1', '-fopenmp', '-fPIC', '-fwrapv', '-std=c++14', '-DNDEBUG', '-O3', '-DTORCH_API_INCLUDE_EXTENSION_H', '-DTORCH_EXTENSION_NAME=minkowski', '-D_GLIBCXX_USE_CXX11_ABI=0', '-Wall', '-Wcomment', '-Wno-sign-compare', '-Wno-deprecated-declarations',], + 'nvcc': ['-DBATCH_FIRST=1', '-arch=sm_61', '-rdc=true', '--compiler-options', '-fPIC'], + 'nvcclink': ['-arch=sm_61', '--device-link', '--compiler-options', '-fPIC'], + } +#extra_link_args = ['-pthread', '--device-link', '--compiler-options', '-fPIC', '-Wall', '-Wcomment', '-Wno-sign-compare', '-Wno-deprecated-declarations'] +extra_link_args = ['-pthread', '-fPIC', '-Wall', '-Wcomment', '-Wno-sign-compare', '-Wno-deprecated-declarations'] +''' # Python interface setup( name="MinkowskiEngine", @@ -199,8 +221,9 @@ def _argparse(pattern, argv, is_flag=True): name="MinkowskiEngineBackend", include_dirs=[here, get_python_inc() + "/.."], library_dirs=["objs"], - sources=["pybind/minkowski.cpp",], - libraries=libraries, + sources=["pybind/minkowski.cpp", + ], + libraries=libraries + ["cudart", "cudadevrt"], extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, ) @@ -243,3 +266,37 @@ def _argparse(pattern, argv, is_flag=True): ], python_requires=">=3.6", ) + +''' + "src/convolution.cpp", + "src/math_functions.cpp", + "src/coordsmap.cpp", + "src/gpu_coordsmap.cpp", + "src/pooling_max.cpp", + "src/coords_key.cpp", + "src/pooling_avg.cpp", + "src/pooling_global_avg.cpp", + "src/quantization.cpp", + "src/pooling_global_max.cpp", + "src/pruning.cpp", + "src/3rdparty/gpu_coords_map/include/cuda_unordered_map.cpp", + "src/broadcast.cpp", + "src/coords_manager.cpp", + "src/gpu_coords_manager.cpp", + "src/region.cpp", + "src/pooling_transpose.cpp", 
+ "src/convolution_transpose.cpp", + "src/union.cpp", + "src/pooling_avg.cu", + "src/union.cu", + "src/pooling_max.cu", + "src/math_functions.cu", + "src/pruning.cu", + "src/3rdparty/gpu_coords_map/include/slab_hash/slab_hash.cu", + "src/3rdparty/gpu_coords_map/include/slab_hash/slab_alloc.cu", + "src/3rdparty/gpu_coords_map/include/coordinate.cu", + "src/broadcast.cu", + "src/gpu.cu", + "src/convolution.cu", + ], +''' diff --git a/src/3rdparty/gpu_coords_map/.clang-format b/src/3rdparty/gpu_coords_map/.clang-format new file mode 100644 index 00000000..678d1f6b --- /dev/null +++ b/src/3rdparty/gpu_coords_map/.clang-format @@ -0,0 +1,10 @@ +BasedOnStyle: Google +IndentWidth: 4 +ColumnLimit: 80 +UseTab: Never +Language: Cpp +Standard: Cpp11 +ContinuationIndentWidth: 8 +AccessModifierOffset: -4 +BinPackParameters: false +SortIncludes: true diff --git a/src/3rdparty/gpu_coords_map/CMakeLists.txt b/src/3rdparty/gpu_coords_map/CMakeLists.txt new file mode 100644 index 00000000..483b5640 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/CMakeLists.txt @@ -0,0 +1,72 @@ +cmake_minimum_required (VERSION 3.5 FATAL_ERROR) +project (SlabHash) + +find_package(CUDA 10.1 REQUIRED) + +option(CMAKE_VERBOSE_MAKEFILE ON) + +set(CUDA_NVCC_FLAGS -std=c++11) +set (CMAKE_CXX_STANDARD 11) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + +set(GENCODE_SM30 + -gencode=arch=compute_30,code=sm_30 -gencode=arch=compute_30,code=compute_30) +set(GENCODE_SM35 + -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_35,code=compute_35) +set(GENCODE_SM37 + -gencode=arch=compute_37,code=sm_37 -gencode=arch=compute_37,code=compute_37) +set(GENCODE_SM50 + -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_50,code=compute_50) +set(GENCODE_SM60 + -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60) +set(GENCODE_SM61 + -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_61,code=compute_61) +set(GENCODE_SM70 + -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70) +set(GENCODE_SM71 + -gencode=arch=compute_71,code=sm_71 -gencode=arch=compute_71,code=compute_71) + +option(SLABHASH_GENCODE_SM30 "GENCODE_SM30" OFF) +option(SLABHASH_GENCODE_SM35 "GENCODE_SM35" OFF) +option(SLABHASH_GENCODE_SM37 "GENCODE_SM37" OFF) +option(SLABHASH_GENCODE_SM50 "GENCODE_SM50" OFF) +option(SLABHASH_GENCODE_SM60 "GENCODE_SM60" OFF) +option(SLABHASH_GENCODE_SM61 "GENCODE_SM61" ON) +option(SLABHASH_GENCODE_SM70 "GENCODE_SM70" OFF) +option(SLABHASH_GENCODE_SM71 "GENCODE_SM71" OFF) + +if (SLABHASH_GENCODE_SM30) + set(GENCODE ${GENCODE} ${GENCODE_SM30}) +endif(SLABHASH_GENCODE_SM30) + +if (SLABHASH_GENCODE_SM35) + set(GENCODE ${GENCODE} ${GENCODE_SM35}) +endif(SLABHASH_GENCODE_SM35) + +if (SLABHASH_GENCODE_SM37) + set(GENCODE ${GENCODE} ${GENCODE_SM37}) +endif(SLABHASH_GENCODE_SM37) + +if (SLABHASH_GENCODE_SM50) + set(GENCODE ${GENCODE} ${GENCODE_SM50}) +endif(SLABHASH_GENCODE_SM50) + +if (SLABHASH_GENCODE_SM60) + set(GENCODE ${GENCODE} ${GENCODE_SM60}) +endif(SLABHASH_GENCODE_SM60) + +if (SLABHASH_GENCODE_SM61) + set(GENCODE ${GENCODE} ${GENCODE_SM61}) +endif(SLABHASH_GENCODE_SM61) + +if (SLABHASH_GENCODE_SM70) + set(GENCODE ${GENCODE} ${GENCODE_SM70}) +endif(SLABHASH_GENCODE_SM70) + +if(SLABHASH_GENCODE_SM71) + set(GENCODE ${GENCODE} ${GENCODE_SM71}) +endif(SLABHASH_GENCODE_SM71) + +include_directories(include) +add_subdirectory(test) diff --git a/src/3rdparty/gpu_coords_map/LICENSE b/src/3rdparty/gpu_coords_map/LICENSE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ 
b/src/3rdparty/gpu_coords_map/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/src/3rdparty/gpu_coords_map/README.md b/src/3rdparty/gpu_coords_map/README.md
new file mode 100644
index 00000000..1742fe59
--- /dev/null
+++ b/src/3rdparty/gpu_coords_map/README.md
@@ -0,0 +1,72 @@
+# Multi-thread GPU CoordinateHash with Shared SLAB Router
+Multi-thread version of GPU CoordinateHash with a shared slab router.
+
+## What's new?
+- Light head: the table head has been shrunk by a factor of 32.
+- Singleton: the slab router is reused by making SlabAlloc a singleton.
+- Multi-thread: one slab router, multiple table heads.
+- Random hash: one random number per table head, which reduces
+  collisions.
+
+## Usage example:
+More details in test_unique_with_remove_multithread.cu.
+
+```
+int main() {
+    std::vector<std::thread> vt;
+    vt.reserve(50);
+    for (int i = 0; i != 50; ++i) {
+        vt.emplace_back(std::thread([i] {
+            TEST_6DIM_KEYS_THRUST(1000000);
+            std::cout << "Finish " << i << "th TEST_6DIM_KEYS_THRUST" << std::endl;
+        }));
+    }
+
+    for (int i = 0; i != 50; ++i) {
+        vt[i].join();
+    }
+}
+```
+
+## TODO
+
+1. General improvements:
+
+   - reduce the number of memory allocations and memory copies.
+   - CUDA memory pool for key-value storage.
+
+2. Customize it for specific use cases:
+
+   - custom kernels
+   - custom memory handling
+
+---------------------------------
+
+# GPU CoordinateHash
+This is a modified version of [SlabHash](https://github.com/owensgroup/SlabHash).
+
+The original SlabHash only supports key-value pairs.
+This version supports arbitrary value types and multi-dimensional keys in theory, as well as user-defined hash functions supplied as a template parameter.
+
+Currently only `Key, Value` has been tested.
+
+## Publication
+This library is based on the original slab hash paper, initially proposed in the following IPDPS'18 paper:
+* [Saman Ashkiani, Martin Farach-Colton, John Owens, *A Dynamic Hash Table for the GPU*, 2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS)](https://ieeexplore.ieee.org/abstract/document/8425196)
+
+This library is a refactored and slightly redesigned version of the original code, so that it can be extended and used in other research projects as well. It is still under continuous development. If you find any problem with the code, or have suggestions for potential additions to the library, please raise an issue on GitHub. We will address it as soon as possible.
+
+## Compilation
+1. Make sure to edit `CMakeLists.txt` so that it reflects the GPU device's compute capability. For example, to include compute 3.5 you should have `option(SLABHASH_GENCODE_SM35 "GENCODE_SM35" ON)`.
+2. `mkdir build && cd build`
+3. `cmake ..`
+4. `make -j4`
+
+## Usage
+It is now a header-only library. Include `coordinate_hash_map.cuh` or `coordinate_indexer.cuh` in your .cu file to use the library. Documentation TBD.
+
+## TODO
+- Update copyrights to be consistent with the original Apache license from [SlabHash](https://github.com/owensgroup/SlabHash).
+- Include parallel iterators.
+- Add pybind and improve the `touch/allocate` function for voxel hashing.
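For reference, below is a minimal host-side sketch of how the `cuda::unordered_map` wrapper added later in this patch (`src/3rdparty/gpu_coords_map/include/cuda_unordered_map.h`) could be driven. The two-parameter form `cuda::unordered_map<Key, Value>` with defaulted hash/allocator template arguments, and the `Coordinate<int, 6>` key type, are assumptions inferred from the explicit instantiation in `cuda_unordered_map.cpp`; the actual template parameter list is not fully visible in the patch.

```cpp
// Sketch only: assumes cuda::unordered_map<Key, Value> with defaulted hash and
// allocator template parameters, and a 6-D integer Coordinate key, as suggested
// (but not spelled out) by the headers added in this patch.
#include <vector>

#include "coordinate.h"
#include "cuda_unordered_map.h"

int main() {
    using Key = Coordinate<int, 6>;  // 6-D coordinate key (assumed dimension)

    // Constructor shown in the header: max_keys plus defaulted duplicate_factor,
    // keys_per_bucket, and CUDA device index.
    cuda::unordered_map<Key, int> map(/*max_keys=*/1 << 20);

    // Host-side keys; std::vector value-initializes them, and Coordinate
    // exposes operator[] per dimension.
    std::vector<Key> keys(2);
    keys[0][0] = 1; keys[0][1] = 2;
    keys[1][0] = 3; keys[1][1] = 4;

    // BulkBuild copies the keys to the device, inserts them, and returns the
    // number of stored entries.
    int stored = map.BulkBuild(keys);

    // Search returns (values, masks) as thrust::device_vectors; a zero mask
    // entry means the corresponding key was not found.
    auto result = map.Search(keys);

    return stored > 0 ? 0 : 1;
}
```

Per the constructor and destructor added in the header, device selection (`cudaSetDevice`) and buffer management are handled internally through the `CudaAllocator`, so the caller only supplies host or `thrust::device_vector` inputs.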
diff --git a/src/3rdparty/gpu_coords_map/include/coordinate.cu b/src/3rdparty/gpu_coords_map/include/coordinate.cu new file mode 100644 index 00000000..5ce27910 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/coordinate.cu @@ -0,0 +1,35 @@ +#include "coordinate.h" + +///* +template +__device__ __host__ bool Coordinate::operator==(const Coordinate& rhs) const { + bool equal = true; +#pragma unroll 1 + for (size_t i = 0; i < D; ++i) { + equal = equal && (data_[i] == rhs[i]); + } + return equal; +} +//*/ + +template +struct CoordinateHashFunc { + __device__ __host__ uint64_t operator()(const Coordinate& key) const { + uint64_t hash = UINT64_C(14695981039346656037); + + /** We only support 4-byte and 8-byte types **/ + using input_t = typename std::conditional::type; +#pragma unroll 1 + for (size_t i = 0; i < D; ++i) { + hash ^= *((input_t*)(&key[i])); + hash *= UINT64_C(1099511628211); + } + return hash; + } +}; + +template class Coordinate; +template class Coordinate; +template class Coordinate; +template class Coordinate; diff --git a/src/3rdparty/gpu_coords_map/include/coordinate.h b/src/3rdparty/gpu_coords_map/include/coordinate.h new file mode 100644 index 00000000..dc661137 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/coordinate.h @@ -0,0 +1,59 @@ +// +// Created by dongw1 on 7/1/19. +// + +#include +#include + +template +struct Coordinate { +private: + T data_[D]; + +public: + __device__ __host__ T& operator[](size_t i) { return data_[i]; } + __device__ __host__ const T& operator[](size_t i) const { return data_[i]; } + + __device__ __host__ bool operator==(const Coordinate& rhs) const; + /* + __device__ __host__ bool operator==(const Coordinate& rhs) const { + bool equal = true; +#pragma unroll 1 + for (size_t i = 0; i < D; ++i) { + equal = equal && (data_[i] == rhs[i]); + } + return equal; + } + */ + + static __host__ Coordinate random( + std::default_random_engine generator, + std::uniform_int_distribution dist) { + Coordinate res; + for (size_t i = 0; i < D; ++i) { + res.data_[i] = dist(generator); + } + return res; + } +}; + +template +struct CoordinateHashFunc; +/* +template +struct CoordinateHashFunc { + __device__ __host__ uint64_t operator()(const Coordinate& key) const { + uint64_t hash = UINT64_C(14695981039346656037); + + // We only support 4-byte and 8-byte types + using input_t = typename std::conditional::type; +#pragma unroll 1 + for (size_t i = 0; i < D; ++i) { + hash ^= *((input_t*)(&key[i])); + hash *= UINT64_C(1099511628211); + } + return hash; + } +}; +*/ diff --git a/src/3rdparty/gpu_coords_map/include/cuda_unordered_map.cpp b/src/3rdparty/gpu_coords_map/include/cuda_unordered_map.cpp new file mode 100644 index 00000000..91782030 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/cuda_unordered_map.cpp @@ -0,0 +1,294 @@ +#include "cuda_unordered_map.h" +#include "coordinate.h" + +namespace cuda { +////////////// +template +void unordered_map::BulkInsert(const int* p_coords, + int* p_mapping, + int* p_inverse_mapping, + int size, int key_chunks_) { + assert(key_chunks_ == key_chunks); + slab_hash_->BulkInsertWithMapping(p_coords, p_mapping, + p_inverse_mapping, size); +} + +template +void +unordered_map:: +IterateKeys(int* p_coords, int size) { + slab_hash_->IterateKeys(p_coords, size); +} + +template +void +unordered_map:: +IterateSearchAtBatch(int* p_out, int batch_index, int size) { + slab_hash_->IterateSearchAtBatch(p_out, batch_index, size); +} + +template +void +unordered_map:: +IterateSearchPerBatch(const std::vector& p_outs, int 
size) { + slab_hash_->IterateSearchPerBatch(p_outs, size); +} + +template +void +unordered_map:: +IterateOffsetInsert(const std::shared_ptr>& in_map, + int* p_offset, int size) { + slab_hash_->IterateOffsetInsert(in_map->get_slab_hash(), + p_offset, size); +} + +template +void +unordered_map:: +IterateOffsetInsertWithInsOuts(const std::shared_ptr>& in_map, + int* p_offset, + int* p_in, int* p_out, + int size) {} + +template +void +unordered_map:: +IterateOffsetSearch(const std::shared_ptr>& in_map, + int* p_offset, + int* p_in, int* p_out, + int size) {} + +template +void +unordered_map:: +IterateBatchInsert(const std::shared_ptr>& in_map, + int size) {} + +template +void +unordered_map:: +IterateBatchSearch(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + int size) {} + +template +void +unordered_map:: +IterateStrideInsert(const std::shared_ptr>& in_map, + const std::vector& tensor_strides, + int size) {} + +template +void +unordered_map:: +IterateStrideInsertWithInOut(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + const std::vector& tensor_strides, + int size) {} + +template +void +unordered_map:: +IterateStrideSearch(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + const std::vector& tensor_strides, + int size) {} + +template +void +unordered_map:: +IterateInsert(const std::shared_ptr>& in_map, + int size) {} + +template +void +unordered_map:: +IterateInsertWithInsOuts(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + int size) {} + +template +void +unordered_map:: +IterateSearch(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + int size) {} + +template +void +unordered_map:: +IteratePruneInsert(const std::shared_ptr>& in_map, + bool* p_keep, int keep_size, + int size) {} + +template +void +unordered_map:: +IteratePruneInsertWithInOut(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + bool* p_keep, int keep_size, + int size) {} + +template +void +unordered_map:: +IteratePruneSearch(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + bool* p_keep, int keep_size, + int size) {} +////////////// + +template class unordered_map, int, 5, 5>; + +} // cuda diff --git a/src/3rdparty/gpu_coords_map/include/cuda_unordered_map.h b/src/3rdparty/gpu_coords_map/include/cuda_unordered_map.h new file mode 100644 index 00000000..d82fb7d2 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/cuda_unordered_map.h @@ -0,0 +1,584 @@ +/* + * Copyright 2019 Saman Ashkiani, + * Modified 2019 by Wei Dong + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include +#include "slab_hash/slab_hash.h" + +/* + * Default hash function: + * It treat any kind of input as a concatenation of ints. 
+ */ +template +struct hash { + __device__ __host__ uint64_t operator()(const Key& key) const { + uint64_t hash = UINT64_C(14695981039346656037); + + const int chunks = sizeof(Key) / sizeof(int); + for (size_t i = 0; i < chunks; ++i) { + hash ^= ((int32_t*)(&key))[i]; + hash *= UINT64_C(1099511628211); + } + return hash; + } +}; + +/* Lightweight wrapper to handle host input */ +/* Key supports elementary types: int, long, etc. */ +/* Value supports arbitrary types in theory. */ +/* std::vector is specialized: it stores only one bit per element + * We have to use uint8_t instead to read and write masks + * https://en.wikipedia.org/w/index.php?title=Sequence_container_(C%2B%2B)&oldid=767869909#Specialization_for_bool + */ +namespace cuda { +using slab_hash::SlabHash; +using slab_hash::CudaAllocator; + +template , + class Alloc = CudaAllocator> +class unordered_map { +public: + using key_type = Key; + using value_type = Value; +public: +// static constexpr uint32_t LOG_NUM_MEM_BLOCKS = 5; +// static constexpr uint32_t LOG_NUM_SUPER_BLOCKS = 5; + static constexpr uint32_t key_chunks = sizeof(Key) / sizeof(uint32_t); + static constexpr uint32_t value_chunks = sizeof(Value) / sizeof(uint32_t); + static constexpr uint32_t MEM_UNIT_WARP_MULTIPLES = key_chunks + value_chunks; +// (sizeof(Key) + sizeof(Value)) / sizeof(uint32_t); +public: + unordered_map() {} + unordered_map(uint32_t max_keys, + /* Preset hash table params to estimate bucket num */ + float duplicate_factor = + 1.0 / pow(2, sizeof(Key) / sizeof(uint32_t) - 1), + uint32_t keys_per_bucket = 31 * 2, + /* CUDA device */ + const uint32_t device_idx = 0); + ~unordered_map(); + + void reserve(uint32_t max_keys, + /* Preset hash table params to estimate bucket num */ + float duplicate_factor = + 1.0 / pow(2, sizeof(Key) / sizeof(uint32_t) - 1), + uint32_t keys_per_bucket = 31 * 2, + /* CUDA device */ + const uint32_t device_idx = 0); + + /* Minimal output */ + /* No output for Insert */ + Value Size(); + Value BulkBuild(const std::vector& input_keys); + Value BulkBuild(thrust::device_vector& input_keys); + Value BulkBuild(Key* input_keys, int num_keys); + + /* Value and mask output for Search */ + std::pair, thrust::device_vector> + Search(const std::vector& input_keys); + std::pair, thrust::device_vector> + Search(thrust::device_vector& input_keys); + std::pair, thrust::device_vector> + Search(Key* input_keys, int num_keys); + + /* No output for Remove */ + void Remove(const std::vector& input_keys); + void Remove(thrust::device_vector& input_keys); + void Remove(Key* input_keys, int num_keys); + + std::pair>, + thrust::device_vector> + Search_(thrust::device_vector& input_keys); + + thrust::device_vector Remove_( + thrust::device_vector& input_keys); + + /* Assistance functions */ + float ComputeLoadFactor(); + std::vector CountElemsPerBucket(); + void CountElems(thrust::device_vector& count); + + ////////////// + void BulkInsert(const int* p_coords, + int* p_mapping, + int* p_inverse_mapping, + int size, int key_chunks_); + void IterateKeys(int* p_coords, int size); + void IterateSearchAtBatch(int* p_out, int batch_index, int size); + void IterateSearchPerBatch(const std::vector& p_outs, int size); + void IterateOffsetInsert(const std::shared_ptr>& in_map, + int* p_offset, int size); + void IterateOffsetInsertWithInsOuts(const std::shared_ptr>& in_map, + int* p_offset, + int* p_in, int* p_out, + int size); + void IterateOffsetSearch(const std::shared_ptr>& in_map, + int* p_offset, + int* p_in, int* p_out, + int size); + void 
IterateBatchInsert(const std::shared_ptr>& in_map, + int size); + void IterateBatchSearch(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + int size); + void IterateStrideInsert(const std::shared_ptr>& in_map, + const std::vector& tensor_strides, + int size); + void IterateStrideInsertWithInOut(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + const std::vector& tensor_strides, + int size); + void IterateStrideSearch(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + const std::vector& tensor_strides, + int size); + void IterateInsert(const std::shared_ptr>& in_map, + int size); + void IterateInsertWithInsOuts(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + int size); + void IterateSearch(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + int size); + void IteratePruneInsert(const std::shared_ptr>& in_map, + bool* p_keep, int keep_size, + int size); + void IteratePruneInsertWithInOut(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + bool* p_keep, int keep_size, + int size); + void IteratePruneSearch(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + bool* p_keep, int keep_size, + int size); + ////////////// + + const std::shared_ptr>& + get_slab_hash() const { + return slab_hash_; + } + +private: + uint32_t max_keys_; + uint32_t num_buckets_; + uint32_t cuda_device_idx_; + + /* Buffer for input cpu data (e.g. from std::vector) */ + Key* input_key_buffer_; + Key* output_key_buffer_; + Value* output_value_buffer_; + _Iterator* output_iterator_buffer_; + uint8_t* output_mask_buffer_; + + std::shared_ptr> slab_hash_; + std::shared_ptr allocator_; +}; + +/* +template +unordered_map::unordered_map( + uint32_t max_keys, + float duplicate_factor, + uint32_t keys_per_bucket, + const uint32_t device_idx) { + reserve(max_keys, duplicate_factor, keys_per_bucket, device_idx); +} +*/ + +template +unordered_map::unordered_map( + uint32_t max_keys, + float duplicate_factor, + uint32_t keys_per_bucket, + const uint32_t device_idx) { + max_keys_ = max_keys; + cuda_device_idx_ = device_idx; + /* Set bucket size */ + uint32_t expected_unique_keys = max_keys * duplicate_factor; + num_buckets_ = (expected_unique_keys + keys_per_bucket - 1) / keys_per_bucket; + + /* Set device */ + int32_t cuda_device_count_ = 0; + CHECK_CUDA(cudaGetDeviceCount(&cuda_device_count_)); + assert(cuda_device_idx_ < cuda_device_count_); + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + allocator_ = std::make_shared(cuda_device_idx_); + + // allocating key, value arrays to buffer input and output: + input_key_buffer_ = allocator_->template allocate(max_keys_); + output_key_buffer_ = allocator_->template allocate(max_keys_); + output_value_buffer_ = allocator_->template allocate(max_keys_); + output_mask_buffer_ = allocator_->template allocate(max_keys_); + output_iterator_buffer_ = + allocator_->template allocate<_Iterator>(max_keys_); + + // allocate an initialize the allocator: + slab_hash_ = std::make_shared>( + num_buckets_, max_keys_, cuda_device_idx_); +} + +template +unordered_map::~unordered_map() { + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + + allocator_->template deallocate(input_key_buffer_); + allocator_->template deallocate(output_key_buffer_); + allocator_->template deallocate(output_value_buffer_); + allocator_->template deallocate(output_mask_buffer_); + allocator_->template deallocate<_Iterator>( + output_iterator_buffer_); +} + +template +Value unordered_map::Size() { + auto elems_per_bucket = slab_hash_->CountElemsPerBucket(); + int total_elems_stored 
= std::accumulate(elems_per_bucket.begin(), + elems_per_bucket.end(), 0); + return total_elems_stored; +} + +template +Value unordered_map::BulkBuild( + const std::vector& input_keys) { + assert(input_keys.size() <= max_keys_); + + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + CHECK_CUDA(cudaMemcpy(input_key_buffer_, input_keys.data(), + sizeof(Key) * input_keys.size(), + cudaMemcpyHostToDevice)); + + slab_hash_->InsertAtomic(input_key_buffer_, + input_keys.size()); + return slab_hash_->Size(); +} + +template +Value unordered_map::BulkBuild( + thrust::device_vector& input_keys) { + assert(input_keys.size() <= max_keys_); + + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + slab_hash_->InsertAtomic(thrust::raw_pointer_cast(input_keys.data()), + input_keys.size()); + return slab_hash_->Size(); +} + +template +Value unordered_map::BulkBuild(Key* input_keys, + int num_keys) { + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + slab_hash_->InsertAtomic(input_keys, num_keys); + return slab_hash_->Size(); +} + +template +std::pair, thrust::device_vector> +unordered_map::Search( + const std::vector& input_keys) { + assert(input_keys.size() <= max_keys_); + + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + CHECK_CUDA(cudaMemset(output_mask_buffer_, 0, + sizeof(uint8_t) * input_keys.size())); + + CHECK_CUDA(cudaMemcpy(input_key_buffer_, input_keys.data(), + sizeof(Key) * input_keys.size(), + + cudaMemcpyHostToDevice)); + slab_hash_->Search(input_key_buffer_, output_value_buffer_, + output_mask_buffer_, input_keys.size()); + CHECK_CUDA(cudaDeviceSynchronize()); + + thrust::device_vector output_values( + output_value_buffer_, output_value_buffer_ + input_keys.size()); + thrust::device_vector output_masks( + output_mask_buffer_, output_mask_buffer_ + input_keys.size()); + return std::make_pair(output_values, output_masks); +} + +template +std::pair, thrust::device_vector> +unordered_map::Search( + thrust::device_vector& input_keys) { + assert(input_keys.size() <= max_keys_); + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + CHECK_CUDA(cudaMemset(output_mask_buffer_, 0, + sizeof(uint8_t) * input_keys.size())); + + slab_hash_->Search(thrust::raw_pointer_cast(input_keys.data()), + output_value_buffer_, output_mask_buffer_, + input_keys.size()); + CHECK_CUDA(cudaDeviceSynchronize()); + + thrust::device_vector output_values( + output_value_buffer_, output_value_buffer_ + input_keys.size()); + thrust::device_vector output_masks( + output_mask_buffer_, output_mask_buffer_ + input_keys.size()); + return std::make_pair(output_values, output_masks); +} + +template +std::pair, thrust::device_vector> +unordered_map::Search(Key* input_keys, int num_keys) { + assert(num_keys <= max_keys_); + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + CHECK_CUDA(cudaMemset(output_mask_buffer_, 0, sizeof(uint8_t) * num_keys)); + + slab_hash_->Search(input_keys, output_value_buffer_, output_mask_buffer_, + num_keys); + CHECK_CUDA(cudaDeviceSynchronize()); + + thrust::device_vector output_values(output_value_buffer_, + output_value_buffer_ + num_keys); + thrust::device_vector output_masks(output_mask_buffer_, + output_mask_buffer_ + num_keys); + return std::make_pair(output_values, output_masks); +} + +template +void unordered_map::Remove( + const std::vector& input_keys) { + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + CHECK_CUDA(cudaMemcpy(input_key_buffer_, input_keys.data(), + sizeof(Key) * input_keys.size(), + cudaMemcpyHostToDevice)); + slab_hash_->Remove(input_key_buffer_, input_keys.size()); +} + +template +void 
unordered_map::Remove( + thrust::device_vector& input_keys) { + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + + slab_hash_->Remove(thrust::raw_pointer_cast(input_keys.data()), + input_keys.size()); +} + +template +void unordered_map::Remove(Key* input_keys, + int num_keys) { + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + slab_hash_->Remove(input_keys, num_keys); +} + +template +std::pair>, + thrust::device_vector> +unordered_map::Search_( + thrust::device_vector& input_keys) { + assert(input_keys.size() <= max_keys_); + + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + CHECK_CUDA(cudaMemset(output_mask_buffer_, 0, + sizeof(uint8_t) * input_keys.size())); + + slab_hash_->Search_(thrust::raw_pointer_cast(input_keys.data()), + output_iterator_buffer_, output_mask_buffer_, + input_keys.size()); + CHECK_CUDA(cudaDeviceSynchronize()); + + thrust::device_vector<_Iterator> output_iterators( + output_iterator_buffer_, + output_iterator_buffer_ + input_keys.size()); + thrust::device_vector output_masks( + output_mask_buffer_, output_mask_buffer_ + input_keys.size()); + return std::make_pair(output_iterators, output_masks); +} + +template +std::vector unordered_map::CountElemsPerBucket() { + return slab_hash_->CountElemsPerBucket(); +} + +template +void unordered_map::CountElems( + thrust::device_vector& count) { + std::vector std_count(3); + thrust::copy(count.begin(), count.end(), std_count.begin()); + printf("Before count: %d\t%d\t%d\n", std_count[0], std_count[1], std_count[2]); + slab_hash_->CountElems(thrust::raw_pointer_cast(count.data())); + thrust::copy(count.begin(), count.end(), std_count.begin()); + printf("After count: %d\t%d\t%d\n", std_count[0], std_count[1], std_count[2]); + assert(std_count[0] == 0); + assert(std_count[1] == 0); + assert(std_count[2] == 0); +} + +template +float unordered_map::ComputeLoadFactor() { + return slab_hash_->ComputeLoadFactor(); +} +} // namespace cuda diff --git a/src/3rdparty/gpu_coords_map/include/helper_cuda.h b/src/3rdparty/gpu_coords_map/include/helper_cuda.h new file mode 100644 index 00000000..68f460e1 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/helper_cuda.h @@ -0,0 +1,55 @@ +/* + * Copyright 2018 Saman Ashkiani + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#define CHECK_CUDA(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + printf("CUDA error at %s %d: %s\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +class CudaTimer { +public: + CudaTimer() { + CHECK_CUDA(cudaEventCreate(&start_)); + CHECK_CUDA(cudaEventCreate(&stop_)); + } + ~CudaTimer() { + CHECK_CUDA(cudaEventDestroy(start_)); + CHECK_CUDA(cudaEventDestroy(stop_)); + } + + void Start() { CHECK_CUDA(cudaEventRecord(start_, 0)); } + + float Stop() { + float time; + CHECK_CUDA(cudaEventRecord(stop_, 0)); + CHECK_CUDA(cudaEventSynchronize(stop_)); + CHECK_CUDA(cudaEventElapsedTime(&time, start_, stop_)); + return time; + } + +private: + cudaEvent_t start_; + cudaEvent_t stop_; +}; diff --git a/src/3rdparty/gpu_coords_map/include/slab_hash/allocator.h b/src/3rdparty/gpu_coords_map/include/slab_hash/allocator.h new file mode 100644 index 00000000..dfc7bed3 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/slab_hash/allocator.h @@ -0,0 +1,63 @@ +#include "../helper_cuda.h" +#include "config.h" + +#pragma once + +namespace slab_hash { + +class Allocator { +public: + Allocator(int device_id = 0) : device_id_(device_id) {} + template + T* allocate(size_t size) {} + + template + void deallocate(T* ptr) {} + +protected: + int device_id_; +}; + +class CudaAllocator: public Allocator { +public: + CudaAllocator(int device_id = 0) : Allocator(device_id) {} + template + T* allocate(size_t size) { + T* ptr; + CHECK_CUDA(cudaMalloc((void**)&ptr, sizeof(T) * size)); + return ptr; + } + + template + void deallocate(T* ptr) { + CHECK_CUDA(cudaFree(ptr)); + } +}; + +/** +class PyTorchAllocator: public Allocator { +public: + PyTorchAllocator(int device_id = 0) : Allocator(device_id) {} + + template + T* allocate(size_t size) { + CHECK_CUDA(cudaGetDevice(&device_id_)); + auto options = torch::TensorOptions() + .dtype(torch::kInt8) + .device(torch::kCUDA, device_id_) + .requires_grad(false); + tensor_ = torch::zeros(sizeof(T) * size, options); + return tensor_.data() + } + + template + void deallocate(T* ptr) { + // let PyTorch handle this + } + +protected: + torch::Tensor tensor_; +}; +**/ + +} // namespace slab_hash diff --git a/src/3rdparty/gpu_coords_map/include/slab_hash/config.h b/src/3rdparty/gpu_coords_map/include/slab_hash/config.h new file mode 100644 index 00000000..473d56cc --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/slab_hash/config.h @@ -0,0 +1,43 @@ +/* + * Copyright 2019 Saman Ashkiani + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing permissions and + * limitations under the License. 
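// A short usage sketch of the helpers defined above: CHECK_CUDA aborts with file/line
// information on any CUDA error, CudaTimer measures elapsed GPU time between two
// recorded events, and slab_hash::CudaAllocator is the thin cudaMalloc/cudaFree wrapper
// the slab hash uses for its device buffers. The function name below is illustrative.
#include <cstdint>
#include <cuda_runtime.h>

void helper_usage_sketch() {
    slab_hash::CudaAllocator alloc(/* device_id = */ 0);

    // allocate<T>(n) returns a device pointer to n elements of T.
    uint32_t* d_buf = alloc.allocate<uint32_t>(1 << 20);

    CudaTimer timer;
    timer.Start();
    CHECK_CUDA(cudaMemset(d_buf, 0, (1 << 20) * sizeof(uint32_t)));
    float ms = timer.Stop();   // milliseconds between the Start and Stop events
    (void)ms;

    alloc.deallocate(d_buf);
}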
+ */ + +#pragma once + +#include + +/** Built-in flags **/ +static constexpr uint32_t EMPTY_SLAB_PTR = 0xFFFFFFFF; +static constexpr uint32_t EMPTY_PAIR_PTR = 0xFFFFFFFF; +static constexpr uint32_t HEAD_SLAB_PTR = 0xFFFFFFFE; + +/** Queries **/ +static constexpr uint32_t SEARCH_NOT_FOUND = 0xFFFFFFFF; + +/** Warp operations **/ +static constexpr uint32_t WARP_WIDTH = 32; +static constexpr uint32_t BLOCKSIZE_ = 128; + +/* bits: 31 | 30 | ... | 3 | 2 | 1 | 0 */ +static constexpr uint32_t ACTIVE_LANES_MASK = 0xFFFFFFFF; +static constexpr uint32_t PAIR_PTR_LANES_MASK = 0x7FFFFFFF; +static constexpr uint32_t NEXT_SLAB_PTR_LANE = 31; + +using addr_t = uint32_t; + +/* These types are all the same, but distiguish the naming can lead to clearer + * meanings*/ +using ptr_t = uint32_t; +static constexpr uint32_t NULL_ITERATOR = 0xFFFFFFFF; diff --git a/src/3rdparty/gpu_coords_map/include/slab_hash/slab_alloc.cu b/src/3rdparty/gpu_coords_map/include/slab_hash/slab_alloc.cu new file mode 100644 index 00000000..0d53cecd --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/slab_hash/slab_alloc.cu @@ -0,0 +1,151 @@ +#include +#include +#include "slab_alloc.h" + +namespace slab_hash { + +template +std::vector SlabAlloc<_Alloc, _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>:: + CountSlabsPerSuperblock() { + const uint32_t num_super_blocks = slab_alloc_context_.num_super_blocks_; + + auto slabs_per_superblock_buffer = + allocator_->template allocate(num_super_blocks); + thrust::device_vector slabs_per_superblock( + slabs_per_superblock_buffer, + slabs_per_superblock_buffer + num_super_blocks); + thrust::fill(slabs_per_superblock.begin(), slabs_per_superblock.end(), + 0); + + // counting total number of allocated memory units: + int blocksize = 128; + int num_mem_units = + slab_alloc_context_.NUM_MEM_BLOCKS_PER_SUPER_BLOCK_ * 32; + int num_cuda_blocks = (num_mem_units + blocksize - 1) / blocksize; + CountSlabsPerSuperblockKernel<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES><<>>( + slab_alloc_context_, + thrust::raw_pointer_cast(slabs_per_superblock.data())); + + std::vector result(num_super_blocks); + thrust::copy(slabs_per_superblock.begin(), slabs_per_superblock.end(), + result.begin()); + allocator_->template deallocate(slabs_per_superblock_buffer); + return std::move(result); + } +template + __device__ uint32_t SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>:: + WarpAllocate(const uint32_t& lane_id) { + // tries and allocate a new memory units within the resident memory + // block if it returns 0xFFFFFFFF, then there was not any empty memory + // unit a new resident block should be chosen, and repeat again + // allocated result: _LOG_NUM_SUPER_BLOCKS bits: super_block_index + // (22 - _LOG_NUM_SUPER_BLOCKS) bits: memory block index + // 5 bits: memory unit index (hi-bits of 10bit) + // 5 bits: memory unit index (lo-bits of 10bit) + int empty_lane = -1; + uint32_t free_lane; + uint32_t read_bitmap = resident_bitmap_; + uint32_t allocated_result = 0xFFFFFFFF; + // works as long as <31 bit are used in the allocated_result + // in other words, if there are 32 super blocks and at most 64k blocks + // per super block + + while (allocated_result == 0xFFFFFFFF) { + empty_lane = __ffs(~resident_bitmap_) - 1; + free_lane = __ballot_sync(0xFFFFFFFF, empty_lane >= 0); + if (free_lane == 0) { + // all bitmaps are full: need to be rehashed again: + updateMemBlockIndex(((threadIdx.x + blockIdx.x * blockDim.x) >> + 5) + 
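// Note on the bitmap arithmetic in WarpAllocate: every memory block carries 32 bitmap
// words (BITMAP_SIZE_), one per warp lane, and each word tracks 32 memory units, so a
// block covers 32 * 32 = 1024 units, matching NUM_MEM_UNITS_PER_BLOCK_ in slab_alloc.h.
// __ffs(~resident_bitmap_) - 1 picks the first free unit in this lane's word, the
// atomicCAS on that word claims it, and a failed CAS merely refreshes the cached bitmap
// before the next attempt; only when no lane sees a free bit does the warp move to
// another memory block via updateMemBlockIndex().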
hash_coef_); + read_bitmap = resident_bitmap_; + continue; + } + uint32_t src_lane = __ffs(free_lane) - 1; + if (src_lane == lane_id) { + read_bitmap = atomicCAS( + super_blocks_ + super_block_index_ * SUPER_BLOCK_SIZE_ + + resident_index_ * BITMAP_SIZE_ + lane_id, + resident_bitmap_, resident_bitmap_ | (1 << empty_lane)); + if (read_bitmap == resident_bitmap_) { + // successful attempt: + resident_bitmap_ |= (1 << empty_lane); + allocated_result = + (super_block_index_ + << SUPER_BLOCK_BIT_OFFSET_ALLOC_) | + (resident_index_ << MEM_BLOCK_BIT_OFFSET_ALLOC_) | + (lane_id << MEM_UNIT_BIT_OFFSET_ALLOC_) | + empty_lane; + } else { + // Not successful: updating the current bitmap + resident_bitmap_ = read_bitmap; + } + } + // asking for the allocated result; + allocated_result = + __shfl_sync(0xFFFFFFFF, allocated_result, src_lane); + } + return allocated_result; + } + + // called when the allocator fails to find an empty unit to allocate: +template + __device__ void SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::updateMemBlockIndex(uint32_t global_warp_id) { + num_attempts_++; + assert(num_attempts_ < 11); + super_block_index_++; + super_block_index_ = (super_block_index_ == num_super_blocks_) + ? 0 + : super_block_index_; + + resident_index_++; + resident_index_ = (resident_index_ == NUM_MEM_BLOCKS_PER_SUPER_BLOCK_) + ? 0 + : resident_index_; + + // loading the assigned memory block: + resident_bitmap_ = + *((super_blocks_ + super_block_index_ * SUPER_BLOCK_SIZE_) + + resident_index_ * BITMAP_SIZE_ + (threadIdx.x & 0x1F)); + } + +template +__global__ void CountSlabsPerSuperblockKernel(SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> context, + uint32_t* slabs_per_superblock) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + + int num_bitmaps = context.NUM_MEM_BLOCKS_PER_SUPER_BLOCK_ * 32; + if (tid >= num_bitmaps) { + return; + } + + for (int i = 0; i < context.num_super_blocks_; i++) { + uint32_t read_bitmap = *(context.get_ptr_for_bitmap(i, tid)); + atomicAdd(&slabs_per_superblock[i], __popc(read_bitmap)); + } +} + +template class SlabAlloc; +template class SlabAllocContext<5, 5, 5>; + +//template __device__ uint32_t SlabAllocContext<5, 5, 5>::WarpAllocate(const uint32_t& lane_id); + +} // namespace slab_hash diff --git a/src/3rdparty/gpu_coords_map/include/slab_hash/slab_alloc.h b/src/3rdparty/gpu_coords_map/include/slab_hash/slab_alloc.h new file mode 100644 index 00000000..aa543e01 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/slab_hash/slab_alloc.h @@ -0,0 +1,424 @@ +/* + * Copyright 2018 Saman Ashkiani + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing permissions and + * limitations under the License. 
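// The 32-bit slab address returned by WarpAllocate() packs three indices. For the
// <_LOG_NUM_MEM_BLOCKS = 5, _LOG_NUM_SUPER_BLOCKS = 5, _MEM_UNIT_WARP_MULTIPLES = 5>
// instantiation above, the layout is:
//   bits 31..27  super block index                (_LOG_NUM_SUPER_BLOCKS bits)
//   bits 26..10  memory block index               (17 bits of room, 5 of them used)
//   bits  9..5   bitmap lane inside the block     (hi half of the 10-bit unit index)
//   bits  4..0   bit position inside that lane    (lo half of the 10-bit unit index)
// A host-side decoding sketch mirroring getSuperBlockIndex / getMemBlockIndex /
// getMemUnitIndex in SlabAllocContext (the struct name here is illustrative only):
#include <cstdint>

struct SlabAddrParts {
    uint32_t super_block;
    uint32_t mem_block;
    uint32_t mem_unit;
};

inline SlabAddrParts decode_slab_addr(uint32_t addr) {
    SlabAddrParts p;
    p.super_block = addr >> 27;              // SUPER_BLOCK_BIT_OFFSET_ALLOC_ = 32 - 5
    p.mem_block   = (addr >> 10) & 0x1FFFF;  // MEM_BLOCK_BIT_OFFSET_ALLOC_ = 10, 17-bit mask
    p.mem_unit    = addr & 0x3FF;            // MEM_UNIT_MASK_ = (1 << 10) - 1
    return p;
}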
+ */ + +#pragma once + +#include +#include +#include +#include +#include "../helper_cuda.h" +#include "allocator.h" +#include "config.h" +/* + * This class does not own any memory, and will be shallowly copied into device + * kernel + */ + +namespace slab_hash { + +template +class SlabAllocContext { +public: + static constexpr uint32_t NUM_SUPER_BLOCKS_ALLOCATOR_ = + (1 << _LOG_NUM_SUPER_BLOCKS); + + // fixed parameters for the SlabAlloc + static constexpr uint32_t NUM_MEM_UNITS_PER_BLOCK_ = 1024; + static constexpr uint32_t NUM_BITMAP_PER_MEM_BLOCK_ = 32; + static constexpr uint32_t BITMAP_SIZE_ = 32; + static constexpr uint32_t WARP_SIZE = 32; + static constexpr uint32_t MEM_UNIT_SIZE_ = + _MEM_UNIT_WARP_MULTIPLES * (WARP_SIZE - 1) + 1; + static constexpr uint32_t SUPER_BLOCK_BIT_OFFSET_ALLOC_ = + 32 - _LOG_NUM_SUPER_BLOCKS; + static constexpr uint32_t MEM_BLOCK_BIT_OFFSET_ALLOC_ = 10; + static constexpr uint32_t MEM_UNIT_BIT_OFFSET_ALLOC_ = 5; + static constexpr uint32_t NUM_MEM_BLOCKS_PER_SUPER_BLOCK_ = + (1 << _LOG_NUM_MEM_BLOCKS); + static constexpr uint32_t MEM_BLOCK_SIZE_ = + NUM_MEM_UNITS_PER_BLOCK_ * MEM_UNIT_SIZE_; + static constexpr uint32_t SUPER_BLOCK_SIZE_ = + ((BITMAP_SIZE_ + MEM_BLOCK_SIZE_) * + NUM_MEM_BLOCKS_PER_SUPER_BLOCK_); + static constexpr uint32_t MEM_BLOCK_OFFSET_ = + (BITMAP_SIZE_ * NUM_MEM_BLOCKS_PER_SUPER_BLOCK_); + static constexpr uint32_t num_super_blocks_ = NUM_SUPER_BLOCKS_ALLOCATOR_; + + static constexpr uint32_t MEM_BLOCKS_MASK_ = ((1 << _LOG_NUM_MEM_BLOCKS) - 1); + static constexpr uint32_t SUPER_BLOCKS_MASK_ = ((1 << _LOG_NUM_SUPER_BLOCKS) - 1); + static constexpr uint32_t MEM_BLOCK_MASK_ = ((1 << (SUPER_BLOCK_BIT_OFFSET_ALLOC_ - + MEM_BLOCK_BIT_OFFSET_ALLOC_)) - 1); + static constexpr uint32_t MEM_UNIT_MASK_ = ((1 << MEM_BLOCK_BIT_OFFSET_ALLOC_) - 1); + + SlabAllocContext() + : super_blocks_(nullptr), + hash_coef_(0), + num_attempts_(0), + resident_index_(0), + super_block_index_(0), + allocated_index_(0) {} + + SlabAllocContext& operator=(const SlabAllocContext& rhs) { + super_blocks_ = rhs.super_blocks_; + hash_coef_ = rhs.hash_coef_; + num_attempts_ = 0; + resident_index_ = 0; + super_block_index_ = 0; + allocated_index_ = 0; + return *this; + } + + ~SlabAllocContext() {} + + void Setup(uint32_t* super_blocks) { + super_blocks_ = super_blocks; + } + + __device__ __forceinline__ uint32_t* get_unit_ptr_from_slab( + const addr_t& next, const uint32_t& lane_id) { + return super_blocks_ + addressDecoder(next) + lane_id * _MEM_UNIT_WARP_MULTIPLES; + } + __device__ __forceinline__ uint32_t* get_ptr_for_bitmap( + const uint32_t super_block_index, const uint32_t bitmap_index) { + return super_blocks_ + super_block_index * SUPER_BLOCK_SIZE_ + + bitmap_index; + } + + // Objective: each warp selects its own resident warp allocator: + __device__ void Init(uint32_t& hash_coef, uint32_t& tid, uint32_t& lane_id) { + // resident in register per thread + // call on before every insertion + hash_coef_ = hash_coef; + num_attempts_ = 0; + // hashing the memory block to be used: + createMemBlockIndex((tid >> 5) + hash_coef_); + + // loading the assigned memory block: + resident_bitmap_ = + *(super_blocks_ + super_block_index_ * SUPER_BLOCK_SIZE_ + + resident_index_ * BITMAP_SIZE_ + lane_id); + allocated_index_ = 0xFFFFFFFF; + } + + __device__ uint32_t WarpAllocate(const uint32_t& lane_id); + /* + __device__ uint32_t WarpAllocate(const uint32_t& lane_id) { + // tries and allocate a new memory units within the resident memory + // block if it returns 0xFFFFFFFF, then 
there was not any empty memory + // unit a new resident block should be chosen, and repeat again + // allocated result: _LOG_NUM_SUPER_BLOCKS bits: super_block_index + // (22 - _LOG_NUM_SUPER_BLOCKS) bits: memory block index + // 5 bits: memory unit index (hi-bits of 10bit) + // 5 bits: memory unit index (lo-bits of 10bit) + int empty_lane = -1; + uint32_t free_lane; + uint32_t read_bitmap = resident_bitmap_; + uint32_t allocated_result = 0xFFFFFFFF; + // works as long as <31 bit are used in the allocated_result + // in other words, if there are 32 super blocks and at most 64k blocks + // per super block + + while (allocated_result == 0xFFFFFFFF) { + empty_lane = __ffs(~resident_bitmap_) - 1; + free_lane = __ballot_sync(0xFFFFFFFF, empty_lane >= 0); + if (free_lane == 0) { + // all bitmaps are full: need to be rehashed again: + updateMemBlockIndex(((threadIdx.x + blockIdx.x * blockDim.x) >> + 5) + hash_coef_); + read_bitmap = resident_bitmap_; + continue; + } + uint32_t src_lane = __ffs(free_lane) - 1; + if (src_lane == lane_id) { + read_bitmap = atomicCAS( + super_blocks_ + super_block_index_ * SUPER_BLOCK_SIZE_ + + resident_index_ * BITMAP_SIZE_ + lane_id, + resident_bitmap_, resident_bitmap_ | (1 << empty_lane)); + if (read_bitmap == resident_bitmap_) { + // successful attempt: + resident_bitmap_ |= (1 << empty_lane); + allocated_result = + (super_block_index_ + << SUPER_BLOCK_BIT_OFFSET_ALLOC_) | + (resident_index_ << MEM_BLOCK_BIT_OFFSET_ALLOC_) | + (lane_id << MEM_UNIT_BIT_OFFSET_ALLOC_) | + empty_lane; + } else { + // Not successful: updating the current bitmap + resident_bitmap_ = read_bitmap; + } + } + // asking for the allocated result; + allocated_result = + __shfl_sync(0xFFFFFFFF, allocated_result, src_lane); + } + return allocated_result; + } + */ + + // This function, frees a recently allocated memory unit by a single thread. + // Since it is untouched, there shouldn't be any worries for the actual + // memory contents to be reset again. 
+ __device__ void FreeUntouched(addr_t ptr) { + atomicAnd(super_blocks_ + getSuperBlockIndex(ptr) * SUPER_BLOCK_SIZE_ + + getMemBlockIndex(ptr) * BITMAP_SIZE_ + + (getMemUnitIndex(ptr) >> 5), + ~(1 << (getMemUnitIndex(ptr) & 0x1F))); + } + +private: + // ========= + // some helper inline address functions: + // ========= + __device__ __host__ __forceinline__ uint32_t + getSuperBlockIndex(addr_t address) const { + return address >> SUPER_BLOCK_BIT_OFFSET_ALLOC_; + } + __device__ __host__ __forceinline__ uint32_t + getMemBlockIndex(addr_t address) const { + return (address >> MEM_BLOCK_BIT_OFFSET_ALLOC_) & MEM_BLOCK_MASK_; + } + __device__ __host__ __forceinline__ addr_t + getMemBlockAddress(addr_t address) const { + return (MEM_BLOCK_OFFSET_ + + getMemBlockIndex(address) * MEM_BLOCK_SIZE_); + } + __device__ __host__ __forceinline__ uint32_t + getMemUnitIndex(addr_t address) const { + return address & MEM_UNIT_MASK_; + } + __device__ __host__ __forceinline__ addr_t + getMemUnitAddress(addr_t address) { + return getMemUnitIndex(address) * MEM_UNIT_SIZE_; + } + + // called at the beginning of the kernel: + __device__ void createMemBlockIndex(uint32_t global_warp_id) { + super_block_index_ = global_warp_id & SUPER_BLOCKS_MASK_; + resident_index_ = + (global_warp_id >> _LOG_NUM_SUPER_BLOCKS) & + MEM_BLOCKS_MASK_; + } + + __device__ void updateMemBlockIndex(uint32_t global_warp_id); + /* + // called when the allocator fails to find an empty unit to allocate: + __device__ void updateMemBlockIndex(uint32_t global_warp_id) { + num_attempts_++; + assert(num_attempts_ < 11); + super_block_index_++; + super_block_index_ = (super_block_index_ == num_super_blocks_) + ? 0 + : super_block_index_; + + resident_index_++; + resident_index_ = (resident_index_ == NUM_MEM_BLOCKS_PER_SUPER_BLOCK_) + ? 
0 + : resident_index_; + + // loading the assigned memory block: + resident_bitmap_ = + *((super_blocks_ + super_block_index_ * SUPER_BLOCK_SIZE_) + + resident_index_ * BITMAP_SIZE_ + (threadIdx.x & 0x1F)); + } + */ + + __host__ __device__ addr_t addressDecoder(addr_t address_ptr_index) { + return getSuperBlockIndex(address_ptr_index) * SUPER_BLOCK_SIZE_ + + getMemBlockAddress(address_ptr_index) + + getMemUnitAddress(address_ptr_index); + } + + __host__ __device__ void print_address(addr_t address_ptr_index) { + printf("Super block Index: %d, Memory block index: %d, Memory unit " + "index: " + "%d\n", + getSuperBlockIndex(address_ptr_index), + getMemBlockIndex(address_ptr_index), + getMemUnitIndex(address_ptr_index)); + } + +private: + // a pointer to each super-block + uint32_t* super_blocks_; + + // hash_coef (register): used as (_LOG_NUM_SUPER_BLOCKS + _LOG_NUM_MEM_BLOCKS bits) for hashing + uint32_t hash_coef_; // a random 32-bit + + // resident_index: (register) + // should indicate what memory block and super block is currently resident + // (_LOG_NUM_MEM_BLOCKS bits + _LOG_NUM_SUPER_BLOCKS bits) + // (memory block + super block) + uint32_t num_attempts_; + uint32_t resident_index_; + uint32_t resident_bitmap_; + uint32_t super_block_index_; + uint32_t allocated_index_; // to be asked via shuffle after +}; + +template +__global__ void CountSlabsPerSuperblockKernel(SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> context, + uint32_t* slabs_per_superblock); + +/* + * This class owns the memory for the allocator on the device + */ +template +class SlabAlloc { +public: + using SlabAllocContextT = SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>; +protected: + // a pointer to each super-block + uint32_t* super_blocks_; + + // the context class is actually copied shallowly into GPU device + SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_alloc_context_; +// SlabAllocContextT slab_alloc_context_; + + std::shared_ptr<_Alloc> allocator_; +// _Alloc* allocator_; + +private: + SlabAlloc() : super_blocks_(nullptr) { + + allocator_ = std::make_shared<_Alloc>(); + // In the light version, we put num_super_blocks super blocks within + // a single array + super_blocks_ = allocator_->template allocate( + slab_alloc_context_.SUPER_BLOCK_SIZE_ * + slab_alloc_context_.num_super_blocks_); + + for (int i = 0; i < slab_alloc_context_.num_super_blocks_; i++) { + // setting bitmaps into zeros: + CHECK_CUDA(cudaMemset( + super_blocks_ + i * slab_alloc_context_.SUPER_BLOCK_SIZE_, + 0x00, + slab_alloc_context_.NUM_MEM_BLOCKS_PER_SUPER_BLOCK_ * + slab_alloc_context_.BITMAP_SIZE_ * + sizeof(uint32_t))); + // setting empty memory units into ones: + CHECK_CUDA(cudaMemset( + super_blocks_ + i * slab_alloc_context_.SUPER_BLOCK_SIZE_ + + (slab_alloc_context_ + .NUM_MEM_BLOCKS_PER_SUPER_BLOCK_ * + slab_alloc_context_.BITMAP_SIZE_), + 0xFF, + slab_alloc_context_.MEM_BLOCK_SIZE_ * + slab_alloc_context_ + .NUM_MEM_BLOCKS_PER_SUPER_BLOCK_ * + sizeof(uint32_t))); + } + + // initializing the slab context: + slab_alloc_context_.Setup(super_blocks_); + } + ~SlabAlloc() { + allocator_->template deallocate(super_blocks_); + std::cout << "~SlabAlloc" << std::endl; + } + // Disable copy and assignment construction + SlabAlloc(const SlabAlloc&); + SlabAlloc& operator=(const SlabAlloc&); + +public: + static SlabAlloc<_Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>* 
getInstance() { + static SlabAlloc<_Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> localInstance = + SlabAlloc<_Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>(); + return &localInstance; + } + +public: + SlabAllocContextT& getContext() { return slab_alloc_context_; } + + std::vector CountSlabsPerSuperblock(); + /* + std::vector CountSlabsPerSuperblock() { + const uint32_t num_super_blocks = slab_alloc_context_.num_super_blocks_; + + auto slabs_per_superblock_buffer = + allocator_->template allocate(num_super_blocks); + thrust::device_vector slabs_per_superblock( + slabs_per_superblock_buffer, + slabs_per_superblock_buffer + num_super_blocks); + thrust::fill(slabs_per_superblock.begin(), slabs_per_superblock.end(), + 0); + + // counting total number of allocated memory units: + int blocksize = 128; + int num_mem_units = + slab_alloc_context_.NUM_MEM_BLOCKS_PER_SUPER_BLOCK_ * 32; + int num_cuda_blocks = (num_mem_units + blocksize - 1) / blocksize; + CountSlabsPerSuperblockKernel<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES><<>>( + slab_alloc_context_, + thrust::raw_pointer_cast(slabs_per_superblock.data())); + + std::vector result(num_super_blocks); + thrust::copy(slabs_per_superblock.begin(), slabs_per_superblock.end(), + result.begin()); + allocator_->template deallocate(slabs_per_superblock_buffer); + return std::move(result); + } + */ +}; + +/* +template +__global__ void CountSlabsPerSuperblockKernel(SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> context, + uint32_t* slabs_per_superblock) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + + int num_bitmaps = context.NUM_MEM_BLOCKS_PER_SUPER_BLOCK_ * 32; + if (tid >= num_bitmaps) { + return; + } + + for (int i = 0; i < context.num_super_blocks_; i++) { + uint32_t read_bitmap = *(context.get_ptr_for_bitmap(i, tid)); + atomicAdd(&slabs_per_superblock[i], __popc(read_bitmap)); + } +} +*/ + +} // namespace slab_hash diff --git a/src/3rdparty/gpu_coords_map/include/slab_hash/slab_hash.cu b/src/3rdparty/gpu_coords_map/include/slab_hash/slab_hash.cu new file mode 100644 index 00000000..1d5ca548 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/slab_hash/slab_hash.cu @@ -0,0 +1,1598 @@ +#include +#include "slab_hash.h" +#include "../coordinate.h" +#include "../cuda_unordered_map.h" + +namespace slab_hash { +/** + * Implementation for the host class + **/ +template +SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::SlabHash( + const uint32_t max_bucket_count, + const uint32_t max_keyvalue_count, + uint32_t device_idx) + : num_buckets_(max_bucket_count), + device_idx_(device_idx), + bucket_list_head_(nullptr) { + int32_t device_count = 0; + CHECK_CUDA(cudaGetDeviceCount(&device_count)); + assert(device_idx_ < device_count); + CHECK_CUDA(cudaSetDevice(device_idx_)); + + // allocate an initialize the allocator: + allocator_ = std::make_shared<_Alloc>(device_idx); + slab_list_allocator_ = SlabAlloc<_Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::getInstance(); + + assert(sizeof(_Value) % sizeof(ptr_t) == 0); + // allocating initial buckets: + bucket_list_head_ = allocator_->template allocate(num_buckets_ + + sizeof(_Value) / sizeof(ptr_t)); + cnt_value_ = reinterpret_cast<_Value*>(bucket_list_head_ + num_buckets_); + CHECK_CUDA( + cudaMemset(bucket_list_head_, 0xFF, sizeof(ptr_t) * 
num_buckets_)); + CHECK_CUDA( + cudaMemset(cnt_value_, 0x00, sizeof(_Value))); + + gpu_context_.Setup(bucket_list_head_, num_buckets_, + cnt_value_, + slab_list_allocator_->getContext()); + + // random coefficients for allocator's hash function + std::mt19937 rng(time(0)); + hash_coef_ = rng(); + + std::cout << "hash_coef_: " << hash_coef_ << std::endl; + + const uint32_t num_threads = num_buckets_ * WARP_WIDTH; + const uint32_t num_blocks = (num_threads + BLOCKSIZE_ - 1) / BLOCKSIZE_; + InitKernel<_Key, _Value, _Hash><<>>(gpu_context_, num_threads, hash_coef_); +} + +template +SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::~SlabHash() { + CHECK_CUDA(cudaSetDevice(device_idx_)); + + slab_list_allocator_->getContext() = gpu_context_.get_slab_alloc_ctx(); + auto slabs_per_super_block = slab_list_allocator_->CountSlabsPerSuperblock(); + int total_slabs_stored = std::accumulate( + slabs_per_super_block.begin(), slabs_per_super_block.end(), 0); + + std::cout << "Before total_slabs_stored: " << total_slabs_stored << std::endl; + + for (auto n : slabs_per_super_block) std::cout << n << '\t'; std::cout << std::endl; + + auto elems_per_bucket = CountElemsPerBucket(); + int total_elems_stored = std::accumulate(elems_per_bucket.begin(), + elems_per_bucket.end(), 0); + + printf("Before total_elems_stored: %d\n", total_elems_stored); + + const uint32_t num_threads = num_buckets_ * 32; + const uint32_t num_blocks = (num_threads + BLOCKSIZE_ - 1) / BLOCKSIZE_; + ReleaseKernel<_Key, _Value, _Hash><<>>(gpu_context_, num_threads); +// CHECK_CUDA(cudaDeviceSynchronize()); +// CHECK_CUDA(cudaGetLastError()); + + elems_per_bucket = CountElemsPerBucket(); + total_elems_stored = std::accumulate(elems_per_bucket.begin(), + elems_per_bucket.end(), 0); + + printf("After total_elems_stored: %d\n", total_elems_stored); + + slabs_per_super_block = slab_list_allocator_->CountSlabsPerSuperblock(); + total_slabs_stored = std::accumulate( + slabs_per_super_block.begin(), slabs_per_super_block.end(), 0); + + std::cout << "After total_slabs_stored: " << total_slabs_stored << std::endl; + + for (auto n : slabs_per_super_block) { + std::cout << n << '\t'; +// assert(n == 0); + } + std::cout << std::endl; + + allocator_->template deallocate(bucket_list_head_); + std::cout << num_buckets_ << std::endl; +} + +template +_Value SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::Size() { + + _Value cnt_value; + + CHECK_CUDA(cudaMemcpy(&cnt_value, cnt_value_, + sizeof(_Value), + cudaMemcpyDeviceToHost)); + + return cnt_value; +} + +template +_Value* SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::SizePtr() { + return cnt_value_; +} + +template +void SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::InsertAtomic(_Key* keys, + uint32_t num_keys) { + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; + // calling the kernel for bulk build: + CHECK_CUDA(cudaSetDevice(device_idx_)); + InsertAtomicKernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>(gpu_context_, keys, num_keys, hash_coef_); +// CHECK_CUDA(cudaDeviceSynchronize()); +// CHECK_CUDA(cudaGetLastError()); +} + +template +void SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + 
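// Note on cnt_value_ above: the bucket head array is over-allocated by
// sizeof(_Value) / sizeof(ptr_t) extra slots and cnt_value_ simply aliases that tail.
// Every successful insertion performs atomicAdd(cnt_value_, 1), so the values stored in
// the table double as dense 0-based indices of the unique keys, and Size() is nothing
// more than a single device-to-host copy of that counter.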
_MEM_UNIT_WARP_MULTIPLES>::Search(_Key* keys, + _Value* values, + uint8_t* founds, + uint32_t num_keys) { + CHECK_CUDA(cudaSetDevice(device_idx_)); + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; + SearchKernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>( + gpu_context_, keys, values, founds, num_keys); +// CHECK_CUDA(cudaDeviceSynchronize()); +// CHECK_CUDA(cudaGetLastError()); +} + +template +void SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::Remove(_Key* keys, + uint32_t num_keys) { + std::cout << "Enter Remove" << std::endl; + CHECK_CUDA(cudaSetDevice(device_idx_)); + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; + RemoveKernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>(gpu_context_, keys, num_keys); +// CHECK_CUDA(cudaDeviceSynchronize()); +// CHECK_CUDA(cudaGetLastError()); +} + +template +void SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::InsertAtomic_( + _Key* keys, + _Iterator<_Key, _Value>* iterators, + uint8_t* masks, + uint32_t num_keys) { + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; + // calling the kernel for bulk build: + CHECK_CUDA(cudaSetDevice(device_idx_)); + InsertAtomic_Kernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>( + gpu_context_, keys, iterators, masks, num_keys, hash_coef_); +// CHECK_CUDA(cudaDeviceSynchronize()); +// CHECK_CUDA(cudaGetLastError()); +} + +template +void SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::Search_(_Key* keys, + _Iterator<_Key, _Value>* iterators, + uint8_t* masks, + uint32_t num_keys) { + CHECK_CUDA(cudaSetDevice(device_idx_)); + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; + Search_Kernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>( + gpu_context_, keys, iterators, masks, num_keys); +// CHECK_CUDA(cudaDeviceSynchronize()); +// CHECK_CUDA(cudaGetLastError()); +} + +template +void SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::Remove_(_Key* keys, + uint8_t* masks, + uint32_t num_keys) { + CHECK_CUDA(cudaSetDevice(device_idx_)); + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; + Remove_Kernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>(gpu_context_, keys, masks, num_keys); +// CHECK_CUDA(cudaDeviceSynchronize()); +// CHECK_CUDA(cudaGetLastError()); +} + +/* Debug usage */ +template +std::vector SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::CountElemsPerBucket() { + std::cout << "num_buckets_: " << num_buckets_ << std::endl; + + auto elems_per_bucket_buffer = + allocator_->template allocate(num_buckets_); + + thrust::device_vector elems_per_bucket( + elems_per_bucket_buffer, elems_per_bucket_buffer + num_buckets_); + thrust::fill(elems_per_bucket.begin(), elems_per_bucket.end(), 0); + + const uint32_t blocksize = 128; + const uint32_t num_blocks = (num_buckets_ * 32 + blocksize - 1) / blocksize; + CountElemsPerBucketKernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + 
_LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>( + gpu_context_, thrust::raw_pointer_cast(elems_per_bucket.data()), hash_coef_); + + std::vector result(num_buckets_); + thrust::copy(elems_per_bucket.begin(), elems_per_bucket.end(), + result.begin()); + allocator_->template deallocate(elems_per_bucket_buffer); + return std::move(result); +} + +/* Debug usage */ +template +void SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::CountElems(int* count) { + + _Value cnt_value = Size(); + + auto values_buffer = + allocator_->template allocate(cnt_value); + + auto index_buffer = + allocator_->template allocate(1); + + thrust::device_vector values( + values_buffer, values_buffer + cnt_value); + + thrust::device_vector index( + index_buffer, index_buffer + 1); + thrust::fill(index.begin(), index.end(), 0); + + const uint32_t blocksize = 128; + const uint32_t num_blocks = (num_buckets_ * 32 + blocksize - 1) / blocksize; + std::cout << "Before CountElemsKernel" << std::endl; + CountElemsKernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>( + gpu_context_, + thrust::raw_pointer_cast(values.data()), + thrust::raw_pointer_cast(index.data()), + count + ); + std::cout << "After CountElemsKernel" << std::endl; + + std::vector sorted_values(cnt_value); + std::vector cnt(1); + thrust::copy(values.begin(), values.end(), + sorted_values.begin()); + thrust::copy(index.begin(), index.end(), + cnt.begin()); + allocator_->template deallocate(values_buffer); + allocator_->template deallocate(index_buffer); + + std::cout << "Total Values: " << cnt[0] << std::endl; + std::sort(sorted_values.begin(), sorted_values.begin() + cnt[0]); + for (int i = 0; i != cnt[0]; ++i) { + if (i != sorted_values[i]) std::cout << i << '\t' << sorted_values[i] << std::endl; + assert(i == sorted_values[i]); + } +} + +template +double SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::ComputeLoadFactor() { + auto elems_per_bucket = CountElemsPerBucket(); + int total_elems_stored = std::accumulate(elems_per_bucket.begin(), + elems_per_bucket.end(), 0); + + slab_list_allocator_->getContext() = gpu_context_.get_slab_alloc_ctx(); + auto slabs_per_super_block = slab_list_allocator_->CountSlabsPerSuperblock(); + int total_slabs_stored = std::accumulate( + slabs_per_super_block.begin(), slabs_per_super_block.end(), num_buckets_); + + double load_factor = double(total_elems_stored) / + double(total_slabs_stored * WARP_WIDTH); + + return load_factor; +} + +//////////////// +template +void +SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>:: +BulkInsertWithMapping(const int* p_coords, + int* p_mapping, + int* p_inverse_mapping, + int num_keys) { + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; + // calling the kernel for bulk build: + CHECK_CUDA(cudaSetDevice(device_idx_)); + BulkInsertWithMappingKernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>(gpu_context_, p_coords, + p_mapping, p_inverse_mapping, + num_keys, hash_coef_); +// CHECK_CUDA(cudaDeviceSynchronize()); +// CHECK_CUDA(cudaGetLastError()); +} + +template +void +SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>:: +IterateKeys(int* p_coords, int size) { +} + +template +void +SlabHash<_Key, 
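// Rough intuition for ComputeLoadFactor() above: total_slabs_stored is the allocator's
// per-super-block slab count summed up plus num_buckets_, and every slab is treated as
// WARP_WIDTH (32) slots, so
//     load_factor = total_elems_stored / (total_slabs_stored * 32).
// For example, 1,000,000 stored elements over a combined count of 50,000 slabs gives
// 1,000,000 / (50,000 * 32) = 0.625; a much smaller value means the allocated slabs are
// mostly empty for the chosen number of buckets.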
_Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>:: +IterateSearchAtBatch(int* p_out, int batch_index, int size) {} + +template +void +SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>:: +IterateSearchPerBatch(const std::vector& p_outs, int size) {} + +template +void +SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>:: +IterateOffsetInsert(const std::shared_ptr>& in_map, + int* p_offset, int size) {} +//////////////// + +/** + * Definitions + **/ +template +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::SlabHashContext() + : num_buckets_(0), bucket_list_head_(nullptr) { +} + +template +__global__ void InitKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + const uint32_t num_threads, + uint32_t hash_coef) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= num_threads) return; + uint32_t lane_id = tid & 0x1F; + uint32_t bucket_id = (tid >> 5); + + + slab_hash_ctx.get_slab_alloc_ctx().Init(hash_coef, tid, lane_id); + + ptr_t new_next_slab_ptr = slab_hash_ctx.get_slab_alloc_ctx().WarpAllocate(lane_id); + + if (lane_id == NEXT_SLAB_PTR_LANE) { + + const uint32_t* unit_data_ptr = + slab_hash_ctx.get_slab_ptr_from_list_head(bucket_id); + +// TODO(ljm): Ideal should be OK. + //Ideal: +// *((unsigned int*)unit_data_ptr) = new_next_slab_ptr; + + ptr_t old_next_slab_ptr = + atomicCAS((unsigned int*)unit_data_ptr, + EMPTY_SLAB_PTR, new_next_slab_ptr); + + if (old_next_slab_ptr != EMPTY_SLAB_PTR) { + slab_hash_ctx.get_slab_alloc_ctx().FreeUntouched(new_next_slab_ptr); + } + } + +} + +template +__global__ void ReleaseKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + const uint32_t num_threads) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= num_threads) return; + uint32_t lane_id = (tid & 0x1F); + uint32_t bucket_id = (tid >> 5); + + ptr_t* curr_slab_ptr_ptr = NULL; + ptr_t curr_slab_ptr = EMPTY_SLAB_PTR; + if (lane_id == NEXT_SLAB_PTR_LANE) { + curr_slab_ptr_ptr = + slab_hash_ctx.get_slab_ptr_from_list_head(bucket_id); + curr_slab_ptr = *(curr_slab_ptr_ptr); + } + curr_slab_ptr = + __shfl_sync(ACTIVE_LANES_MASK, curr_slab_ptr, + NEXT_SLAB_PTR_LANE, WARP_WIDTH); + while (curr_slab_ptr != EMPTY_SLAB_PTR) { + + ptr_t* unit_data_ptr = + slab_hash_ctx.get_unit_ptr_from_list_nodes(curr_slab_ptr, lane_id); + + if (lane_id != NEXT_SLAB_PTR_LANE) { + ptr_t old_first_key = atomicExch(unit_data_ptr, EMPTY_PAIR_PTR); + if (old_first_key != EMPTY_PAIR_PTR) { + slab_hash_ctx.ClearRemainPair(unit_data_ptr); + } + } else { + // set empty first + *(curr_slab_ptr_ptr) = EMPTY_SLAB_PTR; + // no need atomicExch ideally + // atomicExch(curr_slab_ptr_ptr, EMPTY_PAIR_PTR); + // then, free untouched + slab_hash_ctx.get_slab_alloc_ctx().FreeUntouched(curr_slab_ptr); + curr_slab_ptr_ptr = unit_data_ptr; + curr_slab_ptr = *(curr_slab_ptr_ptr); + } + curr_slab_ptr = + __shfl_sync(ACTIVE_LANES_MASK, curr_slab_ptr, + NEXT_SLAB_PTR_LANE, WARP_WIDTH); + } +} + +template +__device__ __host__ __forceinline__ uint32_t +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::ComputeBucket( + const _Key& key) const { + 
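// Layout reminder for the kernels above: bucket_list_head_ holds one ptr_t per bucket;
// InitKernel allocates the first slab of every bucket up front and CASes it into that
// head slot, and within each slab lane NEXT_SLAB_PTR_LANE (31) stores the pointer to
// the next slab, so a bucket is a singly linked list of 32-lane slabs. ReleaseKernel
// walks the same list, clearing the stored pairs and returning every slab to the
// allocator via FreeUntouched.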
return hash_fn_(key) % num_buckets_; +} + +template +__device__ __forceinline__ void +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::WarpSyncKey( + const _Key& key, + const uint32_t lane_id, + _Key& ret) { +#pragma unroll 1 + for (size_t i = 0; i != key_chunks; ++i) { + ((int*)(&ret))[i] = __shfl_sync(ACTIVE_LANES_MASK, ((int*)(&key))[i], + lane_id, WARP_WIDTH); + } +} + +template +__device__ __forceinline__ void +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::ClearRemainPair(ptr_t* ptr) { +#pragma unroll 1 + for (size_t i = 1; + i != key_chunks + value_chunks; + ++i) { + ptr[i] = EMPTY_SLAB_PTR; + } +} + +template +__device__ __forceinline__ void +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::CopyRemainPair( + ptr_t* ptr, + const _Key& key, + const _Value& value) { +#pragma unroll 1 + for (size_t i = 1; i < key_chunks; ++i) { + ((int*)(ptr))[i] = ((int*)(&key))[i]; + } +#pragma unroll 1 + for (size_t i = 0; i < value_chunks; ++i) { + ((int*)(ptr))[key_chunks + i] = ((int*)(&value))[i]; + } +} + +template +__device__ int32_t +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::WarpFindKey( + const _Key& key, const uint32_t lane_id, const ptr_t* ptr) { + uint8_t is_lane_found = + /* select key lanes */ + ((1 << lane_id) & PAIR_PTR_LANES_MASK) + && (reinterpret_cast*>(ptr))->first == key; + + return __ffs(__ballot_sync(PAIR_PTR_LANES_MASK, is_lane_found)) - 1; +} + +template +__device__ __forceinline__ int32_t +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::WarpFindEmpty(const ptr_t* ptr) { + //assert(value_chunks == 1); + uint8_t is_lane_empty = (reinterpret_cast(ptr)[_MEM_UNIT_WARP_MULTIPLES - 1] == EMPTY_PAIR_PTR); + return __ffs(__ballot_sync(PAIR_PTR_LANES_MASK, is_lane_empty)) - 1; +} + +template +__device__ __forceinline__ ptr_t +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::AllocateSlab( + const uint32_t lane_id) { + return slab_list_allocator_ctx_.WarpAllocate(lane_id); +} + +template +__device__ __forceinline__ void +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::FreeSlab( + const ptr_t slab_ptr) { + slab_list_allocator_ctx_.FreeUntouched(slab_ptr); +} + +template +__device__ _Pair +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::Search( + uint8_t& to_search, + const uint32_t lane_id, + const uint32_t bucket_id, + const _Key& query_key) { + uint32_t work_queue = 0; + uint32_t prev_work_queue = work_queue; + uint32_t curr_slab_ptr = EMPTY_SLAB_PTR; + + ptr_t* iterator = NULL; + uint8_t mask = false; + + /** > Loop when we have active lanes **/ + while ((work_queue = __ballot_sync(ACTIVE_LANES_MASK, to_search))) { + /** 0. Restart from linked list head if the last query is finished + * **/ + uint32_t src_lane = __ffs(work_queue) - 1; + uint32_t src_bucket = + __shfl_sync(ACTIVE_LANES_MASK, bucket_id, src_lane, WARP_WIDTH); + + _Key src_key; + WarpSyncKey(query_key, src_lane, src_key); + + curr_slab_ptr = + (prev_work_queue != work_queue) + ? 
*(get_slab_ptr_from_list_head(src_bucket)) + : curr_slab_ptr; + + /* Each lane in the warp reads a uint in the slab in parallel */ + const uint32_t* unit_data_ptr = + get_unit_ptr_from_list_nodes(curr_slab_ptr, lane_id); + + int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data_ptr); + + /** 1. Found in this slab, SUCCEED **/ + if (lane_found >= 0) { + /* broadcast found value */ + uint64_t found_pair_internal_ptr = __shfl_sync( + ACTIVE_LANES_MASK, reinterpret_cast(unit_data_ptr), lane_found, WARP_WIDTH); + + if (lane_id == src_lane) { + to_search = false; + + iterator = reinterpret_cast(found_pair_internal_ptr); + mask = true; + } + } + + /** 2. Not found in this slab **/ + else { + ptr_t unit_data = *(reinterpret_cast(unit_data_ptr)); + /* broadcast next slab: lane 31 reads 'next' */ + ptr_t next_slab_ptr = __shfl_sync(ACTIVE_LANES_MASK, unit_data, + NEXT_SLAB_PTR_LANE, WARP_WIDTH); + + /** 2.1. Next slab is empty, ABORT **/ + if (next_slab_ptr == EMPTY_SLAB_PTR) { + if (lane_id == src_lane) { + to_search = false; + } + } + /** 2.2. Next slab exists, RESTART **/ + else { + curr_slab_ptr = next_slab_ptr; + } + } + + prev_work_queue = work_queue; + } + + return _make_pair(iterator, mask); +} + +/* + * Insert: ABORT if found + * replacePair: REPLACE if found + * WE DO NOT ALLOW DUPLICATE KEYS + */ +template +__device__ _Pair +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::InsertAtomic( + uint8_t& to_be_inserted, + const uint32_t lane_id, + const uint32_t bucket_id, + const _Key& key) { + uint32_t work_queue = 0; + uint32_t prev_work_queue = 0; + uint32_t curr_slab_ptr = EMPTY_SLAB_PTR; + + ptr_t* iterator = NULL; + uint8_t mask = false; + + const uint32_t first_key = *reinterpret_cast(&key); + + /** > Loop when we have active lanes **/ + while ((work_queue = __ballot_sync(ACTIVE_LANES_MASK, to_be_inserted))) { + /** 0. Restart from linked list head if last insertion is finished + * **/ + uint32_t src_lane = __ffs(work_queue) - 1; + uint32_t src_bucket = + __shfl_sync(ACTIVE_LANES_MASK, bucket_id, src_lane, WARP_WIDTH); + + curr_slab_ptr = + (prev_work_queue != work_queue) + ? 
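// Search above and InsertAtomic / Remove below share the same warp-cooperative
// work-queue structure; a condensed, self-contained skeleton of that loop (with the
// slab traversal details elided) looks roughly like this:
__device__ void work_queue_skeleton(uint8_t& lane_active, uint32_t lane_id) {
    uint32_t work_queue = 0;
    uint32_t prev_work_queue = 0;
    while ((work_queue = __ballot_sync(ACTIVE_LANES_MASK, lane_active))) {
        // The lowest-numbered lane that still has work becomes the source lane; the
        // whole warp then cooperates on that lane's key (broadcast via __shfl_sync).
        uint32_t src_lane = __ffs(work_queue) - 1;

        // prev_work_queue != work_queue signals that the warp switched to a new query,
        // so the traversal restarts from the bucket's list head.
        bool restart_from_head = (prev_work_queue != work_queue);
        (void)restart_from_head;

        // ... each lane reads one uint of the current slab; WarpFindKey / WarpFindEmpty
        // reduce the 32 per-lane comparisons to a single lane index ...

        if (lane_id == src_lane) {
            lane_active = false;   // the source lane retires once its query is resolved
        }
        prev_work_queue = work_queue;
    }
}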
*(get_slab_ptr_from_list_head(src_bucket)) + : curr_slab_ptr; + + /* Each lane in the warp reads a uint in the slab */ + uint32_t* unit_data_ptr = + get_unit_ptr_from_list_nodes(curr_slab_ptr, lane_id); + + int32_t lane_empty = WarpFindEmpty(unit_data_ptr); + _Key src_key; + WarpSyncKey(key, src_lane, src_key); + int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data_ptr); + + /** Branch 1: key already existing, ABORT **/ + if (lane_found >= 0) { + if (lane_id == src_lane) { + /* free memory heap */ + to_be_inserted = false; + } + } + + /** Branch 2: empty slot available, try to insert **/ + else if (lane_empty >= 0) { + uint64_t lane_empty_data_ptr = __shfl_sync(ACTIVE_LANES_MASK, + reinterpret_cast(unit_data_ptr), lane_empty, WARP_WIDTH); + unit_data_ptr = reinterpret_cast(lane_empty_data_ptr); + if (lane_id == src_lane) { + + uint32_t old_first_data = + atomicCAS(unit_data_ptr, EMPTY_PAIR_PTR, first_key); + + /** Branch 2.1: SUCCEED **/ + if (old_first_data == EMPTY_PAIR_PTR) { + // copy the remaining data + _Value value = atomicAdd(cnt_value_, 1); + + CopyRemainPair(unit_data_ptr, key, value); + to_be_inserted = false; + + iterator = unit_data_ptr; + mask = true; + } + + /** Branch 2.2: failed: RESTART + * In the consequent attempt, + * > if the same key was inserted in this slot, + * we fall back to Branch 1; + * > if a different key was inserted, + * we go to Branch 2 or 3. + * **/ + } + } + + /** Branch 3: nothing found in this slab, goto next slab **/ + else { + /* broadcast next slab */ + ptr_t next_slab_ptr = __shfl_sync(ACTIVE_LANES_MASK, *reinterpret_cast(unit_data_ptr), + NEXT_SLAB_PTR_LANE, WARP_WIDTH); + + /** Branch 3.1: next slab existing, RESTART this lane **/ + if (next_slab_ptr != EMPTY_SLAB_PTR) { + curr_slab_ptr = next_slab_ptr; + } + + /** Branch 3.2: next slab empty, try to allocate one **/ + else { + ptr_t new_next_slab_ptr = AllocateSlab(lane_id); + + if (lane_id == NEXT_SLAB_PTR_LANE) { + const uint32_t* unit_data_ptr = + get_unit_ptr_from_list_nodes( + curr_slab_ptr, + NEXT_SLAB_PTR_LANE); + + ptr_t old_next_slab_ptr = + atomicCAS((unsigned int*)unit_data_ptr, + EMPTY_SLAB_PTR, new_next_slab_ptr); + + /** Branch 3.2.1: other thread allocated, RESTART lane + * In the consequent attempt, goto Branch 2' **/ + if (old_next_slab_ptr != EMPTY_SLAB_PTR) { + FreeSlab(new_next_slab_ptr); + } + /** Branch 3.2.2: this thread allocated, RESTART lane, + * 'goto Branch 2' **/ + } + } + } + + prev_work_queue = work_queue; + } + + return _make_pair(iterator, mask); +} + +template +__device__ uint8_t +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::Remove( + uint8_t& to_be_deleted, + const uint32_t lane_id, + const uint32_t bucket_id, + const _Key& key) { + uint32_t work_queue = 0; + uint32_t prev_work_queue = 0; + uint32_t curr_slab_ptr = EMPTY_SLAB_PTR; + + uint8_t mask = false; + + /** > Loop when we have active lanes **/ + while ((work_queue = __ballot_sync(ACTIVE_LANES_MASK, to_be_deleted))) { + /** 0. Restart from linked list head if last insertion is finished + * **/ + uint32_t src_lane = __ffs(work_queue) - 1; + uint32_t src_bucket = + __shfl_sync(ACTIVE_LANES_MASK, bucket_id, src_lane, WARP_WIDTH); + + _Key src_key; + WarpSyncKey(key, src_lane, src_key); + + curr_slab_ptr = + (prev_work_queue != work_queue) + ? 
*(get_slab_ptr_from_list_head(src_bucket)) + : curr_slab_ptr; + + const uint32_t* unit_data_ptr = + get_unit_ptr_from_list_nodes(curr_slab_ptr, lane_id); + + int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data_ptr); + + /** Branch 1: key found **/ + if (lane_found >= 0) { + + if (lane_id == src_lane) { + uint32_t* unit_data_ptr = + get_unit_ptr_from_list_nodes(curr_slab_ptr, + lane_found); + ptr_t pair_to_delete = *reinterpret_cast(&src_key); + + // TODO: keep in mind the potential double free problem + ptr_t old_key_value_pair = + atomicCAS((unsigned int*)(unit_data_ptr), + pair_to_delete, EMPTY_PAIR_PTR); + + /** Branch 1.1: this thread reset, free src_addr **/ + if (old_key_value_pair == pair_to_delete) { + ClearRemainPair(unit_data_ptr); + mask = true; + } + /** Branch 1.2: other thread did the job, avoid double free + * **/ + to_be_deleted = false; + } + } else { // no matching slot found: + ptr_t unit_data = *(reinterpret_cast(unit_data_ptr)); + ptr_t next_slab_ptr = __shfl_sync(ACTIVE_LANES_MASK, unit_data, + NEXT_SLAB_PTR_LANE, WARP_WIDTH); + if (next_slab_ptr == EMPTY_SLAB_PTR) { + // not found: + if (lane_id == src_lane) { + to_be_deleted = false; + } + } else { + curr_slab_ptr = next_slab_ptr; + } + } + prev_work_queue = work_queue; + } + + return mask; +} + +///////////////// +template +__device__ void +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::BulkInsertWithMapping( + uint8_t& to_be_inserted, + const uint32_t lane_id, + const uint32_t bucket_id, + const _Key& key, + int* p_mapping, + int* p_inverse_mapping, + int key_idx) { + uint32_t work_queue = 0; + uint32_t prev_work_queue = 0; + uint32_t curr_slab_ptr = EMPTY_SLAB_PTR; + + //ptr_t* iterator = NULL; + //uint8_t mask = false; + + const uint32_t first_key = *reinterpret_cast(&key); + + /** > Loop when we have active lanes **/ + while ((work_queue = __ballot_sync(ACTIVE_LANES_MASK, to_be_inserted))) { + /** 0. Restart from linked list head if last insertion is finished + * **/ + uint32_t src_lane = __ffs(work_queue) - 1; + uint32_t src_bucket = + __shfl_sync(ACTIVE_LANES_MASK, bucket_id, src_lane, WARP_WIDTH); + + curr_slab_ptr = + (prev_work_queue != work_queue) + ? 
*(get_slab_ptr_from_list_head(src_bucket)) + : curr_slab_ptr; + + /* Each lane in the warp reads a uint in the slab */ + uint32_t* unit_data_ptr = + get_unit_ptr_from_list_nodes(curr_slab_ptr, lane_id); + + int32_t lane_empty = WarpFindEmpty(unit_data_ptr); + _Key src_key; + WarpSyncKey(key, src_lane, src_key); + int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data_ptr); + + /** Branch 1: key already existing, ABORT **/ + if (lane_found >= 0) { + uint64_t found_pair_internal_ptr = __shfl_sync( + ACTIVE_LANES_MASK, reinterpret_cast(unit_data_ptr), lane_found, WARP_WIDTH); + + if (lane_id == src_lane) { + /// + p_inverse_mapping[key_idx] = static_cast( + *reinterpret_cast<_Value*>( + reinterpret_cast( + found_pair_internal_ptr) + + key_chunks) + ); + /// + to_be_inserted = false; + } + } + + /** Branch 2: empty slot available, try to insert **/ + else if (lane_empty >= 0) { + uint64_t lane_empty_data_ptr = __shfl_sync(ACTIVE_LANES_MASK, + reinterpret_cast(unit_data_ptr), lane_empty, WARP_WIDTH); + unit_data_ptr = reinterpret_cast(lane_empty_data_ptr); + if (lane_id == src_lane) { + + uint32_t old_first_data = + atomicCAS(unit_data_ptr, EMPTY_PAIR_PTR, first_key); + + /** Branch 2.1: SUCCEED **/ + if (old_first_data == EMPTY_PAIR_PTR) { + // copy the remaining data + _Value value = atomicAdd(cnt_value_, 1); + + CopyRemainPair(unit_data_ptr, key, value); + /// + p_mapping[value] = key_idx; + p_inverse_mapping[key_idx] = value; + /// + to_be_inserted = false; + + //iterator = unit_data_ptr; + //mask = true; + } + + /** Branch 2.2: failed: RESTART + * In the consequent attempt, + * > if the same key was inserted in this slot, + * we fall back to Branch 1; + * > if a different key was inserted, + * we go to Branch 2 or 3. + * **/ + } + } + + /** Branch 3: nothing found in this slab, goto next slab **/ + else { + /* broadcast next slab */ + ptr_t next_slab_ptr = __shfl_sync(ACTIVE_LANES_MASK, *reinterpret_cast(unit_data_ptr), + NEXT_SLAB_PTR_LANE, WARP_WIDTH); + + /** Branch 3.1: next slab existing, RESTART this lane **/ + if (next_slab_ptr != EMPTY_SLAB_PTR) { + curr_slab_ptr = next_slab_ptr; + } + + /** Branch 3.2: next slab empty, try to allocate one **/ + else { + ptr_t new_next_slab_ptr = AllocateSlab(lane_id); + + if (lane_id == NEXT_SLAB_PTR_LANE) { + const uint32_t* unit_data_ptr = + get_unit_ptr_from_list_nodes( + curr_slab_ptr, + NEXT_SLAB_PTR_LANE); + + ptr_t old_next_slab_ptr = + atomicCAS((unsigned int*)unit_data_ptr, + EMPTY_SLAB_PTR, new_next_slab_ptr); + + /** Branch 3.2.1: other thread allocated, RESTART lane + * In the consequent attempt, goto Branch 2' **/ + if (old_next_slab_ptr != EMPTY_SLAB_PTR) { + FreeSlab(new_next_slab_ptr); + } + /** Branch 3.2.2: this thread allocated, RESTART lane, + * 'goto Branch 2' **/ + } + } + } + + prev_work_queue = work_queue; + } + +// return _make_pair(iterator, mask); +} + +///////////////// + +template +__global__ void +SearchKernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* keys, + _Value* values, + uint8_t* founds, + uint32_t num_queries) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + /* This warp is idle */ + if ((tid - lane_id) >= num_queries) { + return; + } + + uint8_t lane_active = false; + uint32_t bucket_id = 0; + _Key key; + + if (tid < num_queries) { + lane_active = true; + key = keys[tid]; + bucket_id = slab_hash_ctx.ComputeBucket(key); + } + + _Pair result = + 
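// What BulkInsertWithMapping above produces, in terms of the dense values handed out by
// atomicAdd(cnt_value_, 1): for every input row i, p_inverse_mapping[i] receives the
// dense index of the unique coordinate that row maps to (Branch 1 for duplicates,
// Branch 2.1 for first occurrences), and p_mapping[v] records which input row first
// claimed unique index v. A small illustration, assuming three input coordinates where
// the last duplicates the first:
//   rows:              0 -> (1,2,3)   1 -> (4,5,6)   2 -> (1,2,3)
//   possible result:   p_inverse_mapping = {0, 1, 0},   p_mapping = {0, 1}
// (the concrete dense indices depend on the order in which the atomic counter is hit).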
slab_hash_ctx.Search(lane_active, lane_id, bucket_id, key); + + if (tid < num_queries) { + uint8_t found = result.second; + founds[tid] = found; + values[tid] = found ? reinterpret_cast<_Pair<_Key, _Value>*>(result.first) + ->second + : _Value(0); + + } +} + +template +__global__ void +InsertAtomicKernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + slab_hash_ctx, + _Key* keys, + uint32_t num_keys, + uint32_t hash_coef) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + if ((tid - lane_id) >= num_keys) { + return; + } + + slab_hash_ctx.get_slab_alloc_ctx().Init(hash_coef, tid, lane_id); + + uint8_t lane_active = false; + uint32_t bucket_id = 0; + _Key key; + + if (tid < num_keys) { + lane_active = true; + key = keys[tid]; + bucket_id = slab_hash_ctx.ComputeBucket(key); + } + + slab_hash_ctx.InsertAtomic(lane_active, lane_id, bucket_id, key); +} + +template +__global__ void +RemoveKernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* keys, + uint32_t num_keys) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + if ((tid - lane_id) >= num_keys) { + return; + } + + uint8_t lane_active = false; + uint32_t bucket_id = 0; + _Key key; + + if (tid < num_keys) { + lane_active = true; + key = keys[tid]; + bucket_id = slab_hash_ctx.ComputeBucket(key); + } + + slab_hash_ctx.Remove(lane_active, lane_id, bucket_id, key); +} + +template +__global__ void +Search_Kernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* keys, + _Iterator<_Key, _Value>* iterators, + uint8_t* masks, + uint32_t num_queries) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + /* This warp is idle */ + if ((tid - lane_id) >= num_queries) { + return; + } + + uint8_t lane_active = false; + uint32_t bucket_id = 0; + _Key key; + + if (tid < num_queries) { + lane_active = true; + key = keys[tid]; + bucket_id = slab_hash_ctx.ComputeBucket(key); + } + + _Pair result = + slab_hash_ctx.Search(lane_active, lane_id, bucket_id, key); + + if (tid < num_queries) { + iterators[tid] = reinterpret_cast<_Pair<_Key, _Value>*>(result.first); + masks[tid] = result.second; + } +} + +template +__global__ void +InsertAtomic_Kernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + slab_hash_ctx, + _Key* keys, + _Iterator<_Key, _Value>* iterators, + uint8_t* masks, + uint32_t num_keys, + uint32_t hash_coef) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + if ((tid - lane_id) >= num_keys) { + return; + } + + slab_hash_ctx.get_slab_alloc_ctx().Init(hash_coef, tid, lane_id); + + uint8_t lane_active = false; + uint32_t bucket_id = 0; + _Key key; + + if (tid < num_keys) { + lane_active = true; + key = keys[tid]; + bucket_id = slab_hash_ctx.ComputeBucket(key); + } + + _Pair result = + slab_hash_ctx.InsertAtomic(lane_active, lane_id, bucket_id, key); + + if (tid < num_keys) { + iterators[tid] = reinterpret_cast<_Pair<_Key, _Value>*>(result.first); + masks[tid] = result.second; + } +} + +template +__global__ void +Remove_Kernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + 
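// All of the kernels above use the same thread-to-key mapping: one thread per key,
// launched as (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_ blocks of BLOCKSIZE_ = 128
// threads (e.g. 1000 keys -> 8 blocks of 128). A thread with tid >= num_keys returns
// early only if its whole warp is out of range — the guard is
// (tid - lane_id) >= num_keys, i.e. "lane 0 of my warp is already past the end" —
// because inactive lanes must still participate in the warp-wide __ballot_sync and
// __shfl_sync calls; they simply carry lane_active == false.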
_Key* keys, + uint8_t* masks, + uint32_t num_keys) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + if ((tid - lane_id) >= num_keys) { + return; + } + + uint8_t lane_active = false; + uint32_t bucket_id = 0; + _Key key; + + if (tid < num_keys) { + lane_active = true; + key = keys[tid]; + bucket_id = slab_hash_ctx.ComputeBucket(key); + } + + uint8_t success = + slab_hash_ctx.Remove(lane_active, lane_id, bucket_id, key); + + if (tid < num_keys) { + masks[tid] = success; + } +} + +template +__global__ void GetIteratorsKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + ptr_t* iterators, + uint32_t* iterator_count, + uint32_t num_buckets) { + // global warp ID + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t wid = tid >> 5; + // assigning a warp per bucket + if (wid >= num_buckets) { + return; + } + + /* uint32_t lane_id = threadIdx.x & 0x1F; */ + + /* // initializing the memory allocator on each warp: */ + /* slab_hash_ctx.get_slab_alloc_ctx().Init(tid, lane_id); */ + + /* uint32_t src_unit_data = */ + /* *slab_hash_ctx.get_unit_ptr_from_list_head(wid, lane_id); */ + /* uint32_t active_mask = */ + /* __ballot_sync(PAIR_PTR_LANES_MASK, src_unit_data != + * EMPTY_PAIR_PTR); */ + /* int leader = __ffs(active_mask) - 1; */ + /* uint32_t count = __popc(active_mask); */ + /* uint32_t rank = __popc(active_mask & __lanemask_lt()); */ + /* uint32_t prev_count; */ + /* if (rank == 0) { */ + /* prev_count = atomicAdd(iterator_count, count); */ + /* } */ + /* prev_count = __shfl_sync(active_mask, prev_count, leader); */ + + /* if (src_unit_data != EMPTY_PAIR_PTR) { */ + /* iterators[prev_count + rank] = src_unit_data; */ + /* } */ + + /* uint32_t next = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); */ + /* while (next != EMPTY_SLAB_PTR) { */ + /* src_unit_data = */ + /* *slab_hash_ctx.get_unit_ptr_from_list_nodes(next, + * lane_id); + */ + /* count += __popc(__ballot_sync(PAIR_PTR_LANES_MASK, */ + /* src_unit_data != EMPTY_PAIR_PTR)); + */ + /* next = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); */ + /* } */ + /* // writing back the results: */ + /* if (lane_id == 0) { */ + /* } */ +} + +template +__global__ void CountElemsKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + uint32_t* values, uint32_t* index, int* count) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + // assigning a warp per bucket + uint32_t wid = tid >> 5; + if (wid >= slab_hash_ctx.bucket_size()) { + return; + } + + uint32_t* src_unit_data_ptr = NULL; + ptr_t next = + *slab_hash_ctx.get_slab_ptr_from_list_head(wid); + + // count following nodes + while (next != EMPTY_SLAB_PTR) { +// /* + src_unit_data_ptr = + slab_hash_ctx.get_unit_ptr_from_list_nodes(next, lane_id); + if (NEXT_SLAB_PTR_LANE != lane_id && + src_unit_data_ptr[slab_hash_ctx.key_chunks + slab_hash_ctx.value_chunks - 1] != EMPTY_PAIR_PTR) { + values[atomicAdd(index, 1)] = src_unit_data_ptr[slab_hash_ctx.key_chunks + slab_hash_ctx.value_chunks - 1]; + /* + printf("%d %d %d %d\n", src_unit_data_ptr[3], + src_unit_data_ptr[0], + src_unit_data_ptr[1], + src_unit_data_ptr[2]); + */ + } + +// /* + /////////// + // TODO(ljm): Warning: handle protential overflow + for (int d = 0; d != 3; ++d) { + _Key key = reinterpret_cast<_Pair<_Key, _Value>*>(src_unit_data_ptr)->first; + 
key[d] += 1; + uint8_t lane_active = (NEXT_SLAB_PTR_LANE != lane_id) && + (src_unit_data_ptr[_MEM_UNIT_WARP_MULTIPLES - 1] != EMPTY_PAIR_PTR); + uint32_t bucket_id = slab_hash_ctx.ComputeBucket(key); + _Pair result = + slab_hash_ctx.Search(lane_active, lane_id, bucket_id, key); + if (result.second) { + atomicSub(count + d, 1); + /* + printf("found key[%d] + 1: %d\t%d\t%d --- %d\t%d\t%d\n", + d, key[0], key[1], key[2], + result.first[0], result.first[1], result.first[2]); + */ + } + } + /////////// +// */ + next = __shfl_sync(ACTIVE_LANES_MASK, *src_unit_data_ptr, NEXT_SLAB_PTR_LANE, + WARP_WIDTH); + } +} + +/* + * This kernel can be used to compute total number of elements within each + * bucket. The final results per bucket is stored in d_count_result array + */ +template +__global__ void CountElemsPerBucketKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + uint32_t* bucket_elem_counts, uint32_t hash_coef) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + // assigning a warp per bucket + uint32_t wid = tid >> 5; + if (wid >= slab_hash_ctx.bucket_size()) { + return; + } + + uint32_t count = 0; + + uint32_t src_unit_data = EMPTY_PAIR_PTR; + ptr_t next = + *slab_hash_ctx.get_slab_ptr_from_list_head(wid); + + // count following nodes + while (next != EMPTY_SLAB_PTR) { + src_unit_data = + *slab_hash_ctx.get_unit_ptr_from_list_nodes(next, lane_id); + count += __popc(__ballot_sync(PAIR_PTR_LANES_MASK, + src_unit_data != EMPTY_PAIR_PTR)); + next = __shfl_sync(ACTIVE_LANES_MASK, src_unit_data, NEXT_SLAB_PTR_LANE, + WARP_WIDTH); + } + + // write back the results: + if (lane_id == 0) { + bucket_elem_counts[wid] = count; + } +} + +//////////////// + +template +__global__ void +BulkInsertWithMappingKernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + slab_hash_ctx, +// _Key* keys, + const int* p_coords, + int* p_mapping, + int* p_inverse_mapping, + uint32_t num_keys, + uint32_t hash_coef) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + if ((tid - lane_id) >= num_keys) { + return; + } + + slab_hash_ctx.get_slab_alloc_ctx().Init(hash_coef, tid, lane_id); + + uint8_t lane_active = false; + uint32_t bucket_id = 0; + _Key key; + + if (tid < num_keys) { + lane_active = true; +// key = keys[tid]; + key = *(reinterpret_cast(p_coords + tid * + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::key_chunks)); + bucket_id = slab_hash_ctx.ComputeBucket(key); + } + + slab_hash_ctx.BulkInsertWithMapping(lane_active, lane_id, bucket_id, key, + p_mapping, p_inverse_mapping, tid); + /* + p_inverse_mapping[tid] = static_cast(idx); + p_mapping[cnt] = static_cast(tid); + */ +} +//////////////// +using Key = Coordinate; +template class SlabHash, CudaAllocator, 5, 5, 5>; +template class SlabHashContext, 5, 5, 5>; +} // namespace slab_hash diff --git a/src/3rdparty/gpu_coords_map/include/slab_hash/slab_hash.h b/src/3rdparty/gpu_coords_map/include/slab_hash/slab_hash.h new file mode 100644 index 00000000..cac15a39 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/slab_hash/slab_hash.h @@ -0,0 +1,389 @@ +/* + * Copyright 2019 Saman Ashkiani + * Modified 2019 by Wei Dong + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "slab_alloc.h" + +template +struct _Pair { + _Key first; + _Value second; + __device__ __host__ _Pair(const _Key& key, const _Value& value) + : first(key), second(value) {} + __device__ __host__ _Pair() : first(), second() {} +}; + +template +__device__ __host__ _Pair<_Key, _Value> _make_pair(const _Key& key, + const _Value& value) { + return _Pair<_Key, _Value>(key, value); +} + +template +using _Iterator = _Pair<_Key, _Value>*; + +namespace slab_hash { + +template +class SlabHashContext; + +template +class SlabHash { +public: + SlabHash(const uint32_t max_bucket_count, + const uint32_t max_keyvalue_count, + uint32_t device_idx); + + ~SlabHash(); + + _Value Size(); + _Value* SizePtr(); + /* Simplistic output: no iterators, and success mask is only provided + * for search. + * All the outputs are READ ONLY: change to these output will NOT change the + * internal hash table. + */ + void InsertAtomic(_Key* input_keys, uint32_t num_keys); + void Search(_Key* input_keys, + _Value* output_values, + uint8_t* output_masks, + uint32_t num_keys); + void Remove(_Key* input_keys, uint32_t num_keys); + + /* Verbose output (similar to std): return success masks for all operations, + + * and iterators for insert and search (not for remove operation, as + * iterators are invalid after erase). + * Output iterators supports READ/WRITE: change to these output will + * DIRECTLY change the internal hash table. 
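+ *
+ * Illustrative host-side sketch (the template arguments, buffer names and
+ * sizes below are placeholders; d_keys, d_iterators and d_masks are assumed
+ * to be device buffers holding num_keys entries):
+ *
+ *   slab_hash::SlabHash<Key, Value, Hash, Alloc, 5, 5, 5> table(
+ *       num_buckets, num_keys, 0);  // max buckets, max key-value pairs, GPU 0
+ *   table.InsertAtomic_(d_keys, d_iterators, d_masks, num_keys);
+ *   table.Search_(d_keys, d_iterators, d_masks, num_keys);
+ *   table.Remove_(d_keys, d_masks, num_keys);  // masks report per-key success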
+ */ + void InsertAtomic_(_Key* input_keys, + _Iterator<_Key, _Value>* output_iterators, + uint8_t* output_masks, + uint32_t num_keys); + void Search_(_Key* input_keys, + _Iterator<_Key, _Value>* output_iterators, + uint8_t* output_masks, + uint32_t num_keys); + void Remove_(_Key* input_keys, uint8_t* output_masks, uint32_t num_keys); + + //////////////// + void BulkInsertWithMapping(const int* p_coords, + int* p_mapping, + int* p_inverse_mapping, + int size); + void IterateKeys(int* p_coords, int size); + void IterateSearchAtBatch(int* p_out, int batch_index, int size); + void IterateSearchPerBatch(const std::vector& p_outs, int size); + void IterateOffsetInsert(const std::shared_ptr>& in_map, + int* p_offset, int size); + //////////////// + + /* Debug usages */ + std::vector CountElemsPerBucket(); + + void CountElems(int* count); + + double ComputeLoadFactor(); + +private: + ptr_t* bucket_list_head_; + uint32_t num_buckets_; + + _Value* cnt_value_; + + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> gpu_context_; + + std::shared_ptr<_Alloc> allocator_; + SlabAlloc<_Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>* slab_list_allocator_; + + uint32_t device_idx_; + + uint32_t hash_coef_; +}; + +/** Lite version **/ +template +__global__ void InsertAtomicKernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* input_keys, + uint32_t num_keys, + uint32_t hash_coef); +template +__global__ void SearchKernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* input_keys, + _Value* output_values, + uint8_t* output_masks, + uint32_t num_keys); +template +__global__ void RemoveKernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* input_keys, + uint32_t num_keys); + +/** Verbose version **/ +template +__global__ void InsertAtomic_Kernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* input_keys, + _Iterator<_Key, _Value>* output_iterators, + uint8_t* output_masks, + uint32_t num_keys, + uint32_t hash_coef); +template +__global__ void Search_Kernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* input_keys, + _Iterator<_Key, _Value>* output_iterators, + uint8_t* output_masks, + uint32_t num_keys); +template +__global__ void Remove_Kernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* input_keys, + uint8_t* output_masks, + uint32_t num_keys); + +template +__global__ void GetIteratorsKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Iterator<_Key, _Value>* output_iterators, + uint32_t* output_iterator_count, + uint32_t num_buckets); +template +__global__ void CountElemsPerBucketKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + uint32_t* bucket_elem_counts, uint32_t hash_coef); + +template +__global__ void CountElemsKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + 
_LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + uint32_t* values, uint32_t* index); + +template +__global__ void InitKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + const uint32_t num_threads, + uint32_t hash_coef); + +template +__global__ void ReleaseKernel(const uint32_t num_threads); + +/////////////// +template +__global__ void BulkInsertWithMappingKernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, +// _Key* input_keys, + const int* p_coords, + int* p_mapping, + int* p_inverse_mapping, + uint32_t num_keys, + uint32_t hash_coef); +/////////////// + +/** + * Internal implementation for the device proxy: + * DO NOT ENTER! + **/ +template +class SlabHashContext { +public: + using SlabAllocContextT = SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>; +public: + SlabHashContext(); + __host__ void Setup( + ptr_t* bucket_list_head, + const uint32_t num_buckets, + _Value* cnt_value, + const SlabAllocContextT& allocator_ctx) { + bucket_list_head_ = bucket_list_head; + num_buckets_ = num_buckets; + cnt_value_ = cnt_value; + slab_list_allocator_ctx_ = allocator_ctx; + } + + /* Core SIMT operations, shared by both simplistic and verbose + * interfaces */ + __device__ _Pair InsertAtomic(uint8_t& lane_active, + const uint32_t lane_id, + const uint32_t bucket_id, + const _Key& key); + __device__ _Pair Search(uint8_t& lane_active, + const uint32_t lane_id, + const uint32_t bucket_id, + const _Key& key); + + __device__ uint8_t Remove(uint8_t& lane_active, + const uint32_t lane_id, + const uint32_t bucket_id, + const _Key& key); + + ///////////////// + __device__ void BulkInsertWithMapping(uint8_t& lane_active, + const uint32_t lane_id, + const uint32_t bucket_id, + const _Key& key, + int* p_mapping, + int* p_inverse_mapping, + int key_idx); + ///////////////// + + /* Hash function */ + __device__ __host__ uint32_t ComputeBucket(const _Key& key) const; + __device__ __host__ uint32_t bucket_size() const { return num_buckets_; } + + __device__ __host__ SlabAllocContextT& get_slab_alloc_ctx() { + return slab_list_allocator_ctx_; + } + + __device__ __forceinline__ ptr_t* get_unit_ptr_from_list_nodes( + const ptr_t slab_ptr, const uint32_t lane_id) { + return slab_list_allocator_ctx_.get_unit_ptr_from_slab(slab_ptr, + lane_id); + } + __device__ __forceinline__ ptr_t* get_slab_ptr_from_list_head( + const uint32_t bucket_id) { + return bucket_list_head_ + bucket_id; + } + __device__ __forceinline__ void ClearRemainPair(ptr_t* unit_data_ptr); + +private: + __device__ __forceinline__ void CopyRemainPair(ptr_t* unit_data_ptr, + const _Key& key, + const _Value& value); + __device__ __forceinline__ void WarpSyncKey(const _Key& key, + const uint32_t lane_id, + _Key& ret); + __device__ __forceinline__ int32_t WarpFindKey(const _Key& src_key, + const uint32_t lane_id, + const ptr_t* unit_data_ptr); + __device__ __forceinline__ int32_t WarpFindEmpty(const ptr_t* unit_data_ptr); + + __device__ __forceinline__ ptr_t AllocateSlab(const uint32_t lane_id); + __device__ __forceinline__ void FreeSlab(const ptr_t slab_ptr); + +private: + uint32_t num_buckets_; + _Hash hash_fn_; + + ptr_t* bucket_list_head_; + _Value* cnt_value_; + SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_list_allocator_ctx_; + +public: + static constexpr uint32_t key_chunks 
= sizeof(_Key) / sizeof(uint32_t); + static constexpr uint32_t value_chunks = sizeof(_Value) / sizeof(uint32_t); +}; + + +} // namespace slab_hash diff --git a/src/broadcast.cpp b/src/broadcast.cpp index a03160c1..04435f1c 100644 --- a/src/broadcast.cpp +++ b/src/broadcast.cpp @@ -88,11 +88,11 @@ at::Tensor BroadcastForwardGPU(at::Tensor in_feat, at::Tensor in_feat_glob, int op, py::object py_in_coords_key, py::object py_glob_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); // Both coords must exist // Use the global pooling mapping - const auto &in_out = p_coords_manager->getOriginInOutMapsGPU( + const InOutMapKey map_key = p_coords_manager->getOriginInOutMaps( py_in_coords_key, py_glob_coords_key); auto out_feat = @@ -104,7 +104,9 @@ at::Tensor BroadcastForwardGPU(at::Tensor in_feat, at::Tensor in_feat_glob, BroadcastForwardKernelGPU( in_feat.data(), in_feat.size(0), in_feat_glob.data(), in_feat_glob.size(0), out_feat.data(), in_feat.size(1), op, - in_out.first, in_out.second, handle, at::cuda::getCurrentCUDAStream()); + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + handle, at::cuda::getCurrentCUDAStream()); return out_feat; } @@ -116,13 +118,13 @@ void BroadcastBackwardGPU(at::Tensor in_feat, at::Tensor grad_in_feat, py::object py_in_coords_key, py::object py_glob_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); const InOutMapKey map_key = p_coords_manager->getOriginMapHashKey( py_in_coords_key, py_glob_coords_key); - ASSERT(p_coords_manager->d_in_maps.find(map_key) != - p_coords_manager->d_in_maps.end(), + ASSERT(p_coords_manager->in_maps.find(map_key) != + p_coords_manager->in_maps.end(), "The in-out map doesn't exist for backward. 
Did you run forward pass?") grad_in_feat.resize_as_(in_feat); @@ -137,8 +139,8 @@ void BroadcastBackwardGPU(at::Tensor in_feat, at::Tensor grad_in_feat, in_feat.data(), grad_in_feat.data(), in_feat.size(0), in_feat_glob.data(), grad_in_feat_glob.data(), in_feat_glob.size(0), grad_out_feat.data(), in_feat.size(1), op, - p_coords_manager->d_in_maps[map_key], - p_coords_manager->d_out_maps[map_key], handle, + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], handle, at::cuda::getCurrentCUDAStream()); } #endif @@ -166,23 +168,23 @@ template void BroadcastBackwardCPU( py::object py_coords_manager); #ifndef CPU_ONLY -template at::Tensor BroadcastForwardGPU( +template at::Tensor BroadcastForwardGPU( at::Tensor in_feat, at::Tensor in_feat_glob, int op, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template at::Tensor BroadcastForwardGPU( +template at::Tensor BroadcastForwardGPU( at::Tensor in_feat, at::Tensor in_feat_glob, int op, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void BroadcastBackwardGPU( +template void BroadcastBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor in_feat_glob, at::Tensor grad_in_feat_glob, at::Tensor grad_out_feat, int op, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void BroadcastBackwardGPU( +template void BroadcastBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor in_feat_glob, at::Tensor grad_in_feat_glob, at::Tensor grad_out_feat, int op, py::object py_in_coords_key, py::object py_out_coords_key, diff --git a/src/broadcast.cu b/src/broadcast.cu index b74dd136..4a9d89f7 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -80,13 +80,13 @@ template void BroadcastForwardKernelGPU( const Dtype *d_in_feat, int in_nrows, const Dtype *d_in_feat_global, int in_nrows_global, Dtype *d_out_feat, int nchannel, int op, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cusparseHandle_t cushandle, cudaStream_t stream) { // Sum all sizes int num_map = 0; for (const auto &in_map : in_maps) - num_map += in_map.size(); + num_map += in_map.size(0); if (num_map != in_nrows) throw std::invalid_argument("Invalid in_map"); @@ -100,14 +100,14 @@ void BroadcastForwardKernelGPU( case 0: // + channelwise_addition <<>>( - in_nrows, nchannel, d_in_feat_global, in_maps[0].data(), - out_maps[0].data(), d_out_feat); + in_nrows, nchannel, d_in_feat_global, in_maps[0].data(), + out_maps[0].data(), d_out_feat); break; case 1: // * channelwise_multiplication <<>>( - in_nrows, nchannel, d_in_feat_global, in_maps[0].data(), - out_maps[0].data(), d_out_feat); + in_nrows, nchannel, d_in_feat_global, in_maps[0].data(), + out_maps[0].data(), d_out_feat); break; default: throw std::invalid_argument(Formatter() << "Operation not supported: " @@ -121,13 +121,13 @@ void BroadcastForwardKernelGPU( template void BroadcastForwardKernelGPU( const float *d_in_feat, int in_nrows, const float *d_in_feat_global, int in_nrows_global, float *d_out_feat, int nchannel, int op, - const pInOutMaps &in_map, const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, cusparseHandle_t cuhandle, cudaStream_t stream); template void BroadcastForwardKernelGPU( const double *d_in_feat, int in_nrows, const double *d_in_feat_global, int in_nrows_global, double *d_out_feat, int nchannel, int op, - const pInOutMaps &in_map, const pInOutMaps &out_map, 
+ const vector& in_maps, const vector& out_maps, cusparseHandle_t cuhandle, cudaStream_t stream); template @@ -135,7 +135,7 @@ void BroadcastBackwardKernelGPU( const Dtype *d_in_feat, Dtype *d_grad_in_feat, int in_nrows, const Dtype *d_in_feat_global, Dtype *d_grad_in_feat_global, int in_nrows_global, const Dtype *d_grad_out_feat, int nchannel, int op, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cusparseHandle_t cushandle, cudaStream_t stream) { Itype *d_scr, *d_in_map, *d_out_map, *d_csr_row; Dtype *d_dtype, *d_csr_val, *d_tmp_grad_in_feat_global, *d_tmp_grad_in_feat; @@ -155,7 +155,7 @@ void BroadcastBackwardKernelGPU( // Sum all sizes int num_map = 0; for (const auto &in_map : in_maps) - num_map += in_map.size(); + num_map += in_map.size(0); if (num_map != in_nrows) throw std::invalid_argument("Invalid in_map"); @@ -175,12 +175,12 @@ void BroadcastBackwardKernelGPU( d_csr_row = d_scr + 2 * nnz; // in_nrows_global + 1 CUDA_CHECK(cudaMemcpy(d_in_map, - in_maps[0].data(), // in_maps are contiguous of size nnz + in_maps[0].data(), // in_maps are contiguous of size nnz nnz * sizeof(int), cudaMemcpyDeviceToDevice)); CUDA_CHECK( cudaMemcpy(d_out_map, - out_maps[0].data(), // out_maps are contiguous of size nnz + out_maps[0].data(), // out_maps are contiguous of size nnz nnz * sizeof(int), cudaMemcpyDeviceToDevice)); /* tmp in out feat */ @@ -311,14 +311,14 @@ template void BroadcastBackwardKernelGPU( const float *d_in_feat, float *d_grad_in_feat, int in_nrows, const float *d_in_feat_global, float *d_grad_in_feat_global, int in_nrows_global, const float *d_grad_out_feat, int nchannel, int op, - const pInOutMaps &in_map, const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, cusparseHandle_t cushandle, cudaStream_t stream); template void BroadcastBackwardKernelGPU( const double *d_in_feat, double *d_grad_in_feat, int in_nrows, const double *d_in_feat_global, double *d_grad_in_feat_global, int in_nrows_global, const double *d_grad_out_feat, int nchannel, int op, - const pInOutMaps &in_map, const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, cusparseHandle_t cushandle, cudaStream_t stream); } // namespace minkowski diff --git a/src/broadcast.cuh b/src/broadcast.cuh index 68817130..b28a2ec0 100644 --- a/src/broadcast.cuh +++ b/src/broadcast.cuh @@ -28,6 +28,7 @@ #include #include #include +#include #include "gpu.cuh" #include "gpu_memory_manager.hpp" @@ -41,8 +42,7 @@ void BroadcastForwardKernelGPU(const Dtype *d_in_feat, int in_nrows, const Dtype *d_in_feat_global, int in_nrows_global, Dtype *d_out_feat, int nchannel, int op, - const pInOutMaps &d_in_map, - const pInOutMaps &d_out_map, + const vector& in_maps, const vector& out_maps, cusparseHandle_t cushandle, cudaStream_t stream); template @@ -50,7 +50,7 @@ void BroadcastBackwardKernelGPU( const Dtype *d_in_feat, Dtype *d_grad_in_feat, int in_nrows, const Dtype *d_in_feat_global, Dtype *d_grad_in_feat_global, int in_nrows_global, const Dtype *d_grad_out_feat, int nchannel, int op, - const pInOutMaps &d_in_map, const pInOutMaps &d_out_map, + const vector& in_maps, const vector& out_maps, cusparseHandle_t cushandle, cudaStream_t stream); } // namespace minkowski diff --git a/src/common.hpp b/src/common.hpp index 4666f3d9..c3ea6343 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -32,6 +32,7 @@ #include #include "coords_manager.hpp" +#include "gpu_coords_manager.hpp" #include "types.hpp" #include "utils.hpp" diff --git a/src/convolution.cpp 
b/src/convolution.cpp index b2750f36..79c3ac6b 100644 --- a/src/convolution.cpp +++ b/src/convolution.cpp @@ -101,9 +101,9 @@ void ConvolutionForwardGPU(at::Tensor in_feat, at::Tensor out_feat, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); - const auto &in_out = p_coords_manager->getInOutMapsGPU( + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); + const InOutMapKey map_key = p_coords_manager->getInOutMaps( tensor_strides, strides, kernel_sizes, dilations, region_type, offsets, py_in_coords_key, py_out_coords_key, false); @@ -120,7 +120,10 @@ void ConvolutionForwardGPU(at::Tensor in_feat, at::Tensor out_feat, ConvolutionForwardKernelGPU( in_feat.template data(), in_feat.size(1), out_feat.template data(), out_feat.size(1), - kernel.template data(), in_out.first, in_out.second, out_nrows, + kernel.template data(), + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + out_nrows, handle, at::cuda::getCurrentCUDAStream()); } @@ -133,14 +136,14 @@ void ConvolutionBackwardGPU(at::Tensor in_feat, at::Tensor grad_in_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); const InOutMapKey map_key = p_coords_manager->getMapHashKey( tensor_strides, strides, kernel_sizes, dilations, region_type, py_in_coords_key, py_out_coords_key, false, false); - ASSERT(p_coords_manager->d_in_maps.find(map_key) != - p_coords_manager->d_in_maps.end(), + ASSERT(p_coords_manager->in_maps.find(map_key) != + p_coords_manager->in_maps.end(), "The in-out map doesn't exist for backward. 
Did you run forward pass?") grad_in_feat.resize_as_(in_feat); @@ -155,8 +158,10 @@ void ConvolutionBackwardGPU(at::Tensor in_feat, at::Tensor grad_in_feat, in_feat.template data(), grad_in_feat.template data(), in_feat.size(1), grad_out_feat.template data(), grad_out_feat.size(1), kernel.template data(), - grad_kernel.template data(), p_coords_manager->d_in_maps[map_key], - p_coords_manager->d_out_maps[map_key], grad_out_feat.size(0), handle, + grad_kernel.template data(), + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + grad_out_feat.size(0), handle, at::cuda::getCurrentCUDAStream()); } #endif // end CPU_ONLY @@ -190,27 +195,27 @@ template void ConvolutionBackwardCPU( py::object py_coords_manager); #ifndef CPU_ONLY -template void ConvolutionBackwardGPU( +template void ConvolutionBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor kernel, at::Tensor grad_kernel, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void ConvolutionBackwardGPU( +template void ConvolutionBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor kernel, at::Tensor grad_kernel, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void ConvolutionForwardGPU( +template void ConvolutionForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor kernel, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void ConvolutionForwardGPU( +template void ConvolutionForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor kernel, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, diff --git a/src/convolution.cu b/src/convolution.cu index daf3edfe..2e5ed7bb 100644 --- a/src/convolution.cu +++ b/src/convolution.cu @@ -36,6 +36,10 @@ * Matrix multiplication (CUDA Kernel) on the device: C = A * B * wA is A's width and wB is B's width */ +///////////// +// TODO(ljm): fix offset logic BUG, though MAX_GRID is large enough +// +///////////// template __global__ void matmul(const Dtype *A, const int wA, const int hA, const Dtype *B, const int wB, const int hB, Dtype *C, @@ -213,8 +217,7 @@ template void ConvolutionForwardKernelGPU(const Dtype *d_in_feat, int in_nchannel, Dtype *d_out_feat, int out_nchannel, const Dtype *d_kernel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, int out_nrows, cublasHandle_t cuhandle, cudaStream_t stream) { @@ -240,7 +243,7 @@ void ConvolutionForwardKernelGPU(const Dtype *d_in_feat, int in_nchannel, // Iterate through each spatial kernel and get indices for in_map and out_map for (int k = 0; k < in_maps.size(); k++) { - n_active_in_volume = in_maps[k].size(); + n_active_in_volume = in_maps[k].size(0); if (n_active_in_volume == 0) continue; @@ -258,25 +261,25 @@ void ConvolutionForwardKernelGPU(const Dtype *d_in_feat, int in_nchannel, matmul<<>>( d_in_feat, in_nchannel, curr_num_active, &d_kernel[k * in_nchannel * out_nchannel], out_nchannel, - in_nchannel, d_out_feat, in_maps[k].data(), out_maps[k].data()); + in_nchannel, 
d_out_feat, in_maps[k].data(), out_maps[k].data()); break; case 24: matmul<<>>( d_in_feat, in_nchannel, curr_num_active, &d_kernel[k * in_nchannel * out_nchannel], out_nchannel, - in_nchannel, d_out_feat, in_maps[k].data(), out_maps[k].data()); + in_nchannel, d_out_feat, in_maps[k].data(), out_maps[k].data()); break; case 16: matmul<<>>( d_in_feat, in_nchannel, curr_num_active, &d_kernel[k * in_nchannel * out_nchannel], out_nchannel, - in_nchannel, d_out_feat, in_maps[k].data(), out_maps[k].data()); + in_nchannel, d_out_feat, in_maps[k].data(), out_maps[k].data()); break; case 8: matmul<<>>( d_in_feat, in_nchannel, curr_num_active, &d_kernel[k * in_nchannel * out_nchannel], out_nchannel, - in_nchannel, d_out_feat, in_maps[k].data(), out_maps[k].data()); + in_nchannel, d_out_feat, in_maps[k].data(), out_maps[k].data()); break; } } @@ -287,14 +290,16 @@ void ConvolutionForwardKernelGPU(const Dtype *d_in_feat, int in_nchannel, template void ConvolutionForwardKernelGPU( const float *d_in_feat, int in_nchannel, float *d_out_feat, - int out_nchannel, const float *d_kernel, const pInOutMaps &in_map, - const pInOutMaps &out_map, int out_nrows, cublasHandle_t cuhandle, + int out_nchannel, const float *d_kernel, + const vector& in_maps, const vector& out_maps, + int out_nrows, cublasHandle_t cuhandle, cudaStream_t stream); template void ConvolutionForwardKernelGPU( const double *d_in_feat, int in_nchannel, double *d_out_feat, - int out_nchannel, const double *d_kernel, const pInOutMaps &in_map, - const pInOutMaps &out_map, int out_nrows, cublasHandle_t cuhandle, + int out_nchannel, const double *d_kernel, + const vector& in_maps, const vector& out_maps, + int out_nrows, cublasHandle_t cuhandle, cudaStream_t stream); template @@ -302,8 +307,7 @@ void ConvolutionBackwardKernelGPU(const Dtype *d_in_feat, Dtype *d_grad_in_feat, int in_nchannel, const Dtype *d_grad_out_feat, int out_nchannel, const Dtype *d_kernel, Dtype *d_grad_kernel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, int out_nrows, cublasHandle_t cuhandle, cudaStream_t stream) { @@ -328,7 +332,7 @@ void ConvolutionBackwardKernelGPU(const Dtype *d_in_feat, Dtype *d_grad_in_feat, dim3 threads(shared_mem_size, shared_mem_size); for (int k = 0; k < in_maps.size(); k++) { - n_active_in_volume = in_maps[k].size(); + n_active_in_volume = in_maps[k].size(0); if (n_active_in_volume == 0) continue; @@ -350,7 +354,7 @@ void ConvolutionBackwardKernelGPU(const Dtype *d_in_feat, Dtype *d_grad_in_feat, d_in_feat, in_nchannel, curr_num_active, // D d_grad_in_feat, // C &d_grad_kernel[k * in_nchannel * out_nchannel], // E - in_maps[k].data(), out_maps[k].data()); + in_maps[k].data(), out_maps[k].data()); break; case 24: matmul2<<>>( @@ -360,7 +364,7 @@ void ConvolutionBackwardKernelGPU(const Dtype *d_in_feat, Dtype *d_grad_in_feat, d_in_feat, in_nchannel, curr_num_active, // D d_grad_in_feat, // C &d_grad_kernel[k * in_nchannel * out_nchannel], // E - in_maps[k].data(), out_maps[k].data()); + in_maps[k].data(), out_maps[k].data()); break; case 16: matmul2<<>>( @@ -370,7 +374,7 @@ void ConvolutionBackwardKernelGPU(const Dtype *d_in_feat, Dtype *d_grad_in_feat, d_in_feat, in_nchannel, curr_num_active, // D d_grad_in_feat, // C &d_grad_kernel[k * in_nchannel * out_nchannel], // E - in_maps[k].data(), out_maps[k].data()); + in_maps[k].data(), out_maps[k].data()); break; case 8: matmul2<<>>( @@ -380,7 +384,7 @@ void ConvolutionBackwardKernelGPU(const Dtype *d_in_feat, Dtype *d_grad_in_feat, 
d_in_feat, in_nchannel, curr_num_active, // D d_grad_in_feat, // C &d_grad_kernel[k * in_nchannel * out_nchannel], // E - in_maps[k].data(), out_maps[k].data()); + in_maps[k].data(), out_maps[k].data()); break; } } @@ -392,15 +396,17 @@ void ConvolutionBackwardKernelGPU(const Dtype *d_in_feat, Dtype *d_grad_in_feat, template void ConvolutionBackwardKernelGPU( const float *d_in_feat, float *d_grad_in_feat, int in_nchannel, const float *d_grad_out_feat, int out_nchannel, const float *d_kernel, - float *p_grad_kernel, const pInOutMaps &in_map, - const pInOutMaps &out_map, int out_nrows, cublasHandle_t cuhandle, + float *p_grad_kernel, + const vector& in_maps, const vector& out_maps, + int out_nrows, cublasHandle_t cuhandle, cudaStream_t stream); template void ConvolutionBackwardKernelGPU( const double *d_in_feat, double *d_grad_in_feat, int in_nchannel, const double *d_grad_out_feat, int out_nchannel, const double *d_kernel, - double *p_grad_kernel, const pInOutMaps &in_map, - const pInOutMaps &out_map, int out_nrows, cublasHandle_t cuhandle, + double *p_grad_kernel, + const vector& in_maps, const vector& out_maps, + int out_nrows, cublasHandle_t cuhandle, cudaStream_t stream); } // end namespace minkowski diff --git a/src/convolution.cuh b/src/convolution.cuh index c29b892c..3da509b1 100644 --- a/src/convolution.cuh +++ b/src/convolution.cuh @@ -27,6 +27,7 @@ #include #include +#include #include "gpu.cuh" #include "math_functions.hpp" @@ -48,8 +49,7 @@ template void ConvolutionForwardKernelGPU(const Dtype *d_in_feat, int in_nchannel, Dtype *d_out_feat, int out_nchannel, const Dtype *d_kernel, - const pInOutMaps &in_map, - const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, int out_nrows, cublasHandle_t cuhandle, cudaStream_t stream); @@ -58,8 +58,7 @@ void ConvolutionBackwardKernelGPU(const Dtype *d_in_feat, Dtype *d_grad_in_feat, int in_nchannel, const Dtype *d_grad_out_feat, int out_nchannel, const Dtype *d_kernel, Dtype *d_grad_kernel, - const pInOutMaps &in_map, - const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, int out_nrows, cublasHandle_t cuhandle, cudaStream_t stream); } //end namespace minkowski diff --git a/src/convolution_transpose.cpp b/src/convolution_transpose.cpp index 627a3178..d175ff2a 100644 --- a/src/convolution_transpose.cpp +++ b/src/convolution_transpose.cpp @@ -122,9 +122,9 @@ void ConvolutionTransposeForwardGPU( vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool generate_new_coords) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); - const auto &in_out = p_coords_manager->getInOutMapsGPU( + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); + const InOutMapKey map_key = p_coords_manager->getInOutMaps( tensor_strides, strides, kernel_sizes, dilations, region_type, offsets, py_in_coords_key, py_out_coords_key, true, false, generate_new_coords); @@ -141,7 +141,10 @@ void ConvolutionTransposeForwardGPU( ConvolutionForwardKernelGPU( in_feat.template data(), in_feat.size(1), out_feat.template data(), out_feat.size(1), - kernel.template data(), in_out.first, in_out.second, out_nrows, + kernel.template data(), + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + out_nrows, handle, at::cuda::getCurrentCUDAStream()); } @@ -152,8 +155,8 @@ void ConvolutionTransposeBackwardGPU( vector strides, vector kernel_sizes, vector dilations, int region_type, py::object 
py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); bool reverse_map = false; const InOutMapKey rev_map_key = p_coords_manager->getMapHashKey( tensor_strides, strides, kernel_sizes, dilations, region_type, @@ -177,8 +180,8 @@ void ConvolutionTransposeBackwardGPU( if (!reverse_map) { ASSERT( - p_coords_manager->d_in_maps.find(map_key) != - p_coords_manager->d_in_maps.end(), + p_coords_manager->in_maps.find(map_key) != + p_coords_manager->in_maps.end(), "The in-out map doesn't exist for backward. Did you run forward pass?"); ConvolutionBackwardKernelGPU( @@ -186,13 +189,13 @@ void ConvolutionTransposeBackwardGPU( in_feat.size(1), grad_out_feat.template data(), grad_out_feat.size(1), kernel.template data(), grad_kernel.template data(), - p_coords_manager->d_in_maps[map_key], - p_coords_manager->d_out_maps[map_key], grad_out_feat.size(0), handle, + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], grad_out_feat.size(0), handle, at::cuda::getCurrentCUDAStream()); } else { ASSERT( - p_coords_manager->d_in_maps.find(rev_map_key) != - p_coords_manager->d_in_maps.end(), + p_coords_manager->in_maps.find(rev_map_key) != + p_coords_manager->in_maps.end(), "The in-out map doesn't exist for backward. Did you run forward pass?"); ConvolutionBackwardKernelGPU( @@ -200,8 +203,8 @@ void ConvolutionTransposeBackwardGPU( in_feat.size(1), grad_out_feat.template data(), grad_out_feat.size(1), kernel.template data(), grad_kernel.template data(), - p_coords_manager->d_out_maps[rev_map_key], - p_coords_manager->d_in_maps[rev_map_key], grad_out_feat.size(0), handle, + p_coords_manager->out_maps[rev_map_key], + p_coords_manager->in_maps[rev_map_key], grad_out_feat.size(0), handle, at::cuda::getCurrentCUDAStream()); } } @@ -236,28 +239,28 @@ template void ConvolutionTransposeBackwardCPU( py::object py_coords_manager); #ifndef CPU_ONLY -template void ConvolutionTransposeForwardGPU( +template void ConvolutionTransposeForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor kernel, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool generate_new_coords); -template void ConvolutionTransposeForwardGPU( +template void ConvolutionTransposeForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor kernel, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool generate_new_coords); -template void ConvolutionTransposeBackwardGPU( +template void ConvolutionTransposeBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor kernel, at::Tensor grad_kernel, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void ConvolutionTransposeBackwardGPU( +template void ConvolutionTransposeBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor kernel, at::Tensor grad_kernel, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, diff --git a/src/coords_manager.cu b/src/coords_manager.cu.deprecated 
similarity index 100% rename from src/coords_manager.cu rename to src/coords_manager.cu.deprecated diff --git a/src/coords_manager.hpp b/src/coords_manager.hpp index e2ca7ba7..962664d9 100644 --- a/src/coords_manager.hpp +++ b/src/coords_manager.hpp @@ -42,11 +42,6 @@ #include "types.hpp" #include "utils.hpp" -#ifndef CPU_ONLY -#include "gpu_memory_manager.hpp" -#include -#endif // CPU_ONLY - namespace minkowski { using std::begin; @@ -127,9 +122,6 @@ template class CoordsManager { omp_set_dynamic(0); omp_set_num_threads(num_threads); } -#ifndef CPU_ONLY - gpu_memory_manager = std::make_shared(backend); -#endif } CoordsManager(int num_threads): CoordsManager(num_threads, PYTORCH) {} CoordsManager(): CoordsManager(-1, PYTORCH) {} @@ -154,14 +146,6 @@ template class CoordsManager { vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, bool is_transpose, bool is_pool); -#ifndef CPU_ONLY - vector> - getKernelMapGPU(vector tensor_strides, vector strides, - vector kernel_sizes, vector dilations, - int region_type, at::Tensor offsets, - py::object py_in_coords_key, py::object py_out_coords_key, - bool is_transpose, bool is_pool); -#endif // TODO make this function non-const with ability to generate a new map vector getCoordsMap(py::object py_in_coords_key, py::object py_out_coords_key) const; @@ -268,45 +252,6 @@ template class CoordsManager { vector getRowIndicesPerBatch(py::object py_in_coords_key, py::object py_out_coords_key); -#ifndef CPU_ONLY - // GPU memory manager - std::shared_ptr gpu_memory_manager; - - // Keep all in out maps throughout the lifecycle of the coords manager - // - unordered_map, InOutMapKeyHash> d_in_maps; - unordered_map, InOutMapKeyHash> d_out_maps; - - const pInOutMaps copyInOutMapToGPU(const InOutMaps &map); - void copyInOutMapsToGPU(const InOutMapKey &map_key); - - const pInOutMapsRefPair - getInOutMapsGPU(const vector &tensor_strides, const vector &strides, - const vector &kernel_sizes, const vector &dilations, - int region_type, const at::Tensor &offsets, - py::object py_in_coords_key, py::object py_out_coords_key, - bool is_transpose, bool is_pool = false, - bool force_creation = false); - - const pInOutMapsRefPair - getOriginInOutMapsGPU(py::object py_in_coords_key, - py::object py_out_coords_key); - - const pInOutMapsRefPair - getPruningInOutMapsGPU(at::Tensor use_feat, py::object py_in_coords_key, - py::object py_out_coords_key); - - const pInOutMapsRefPair - getUnionInOutMapsGPU(vector py_in_coords_keys, - py::object py_out_coords_key); - - void *getScratchGPUMemory(size_t size) { - return gpu_memory_manager.get()->tmp_data(size); - } - - void clearScratchGPUMemory() { gpu_memory_manager.get()->clear_tmp(); } - -#endif // CPU_ONLY }; // coordsmanager } // namespace minkowski diff --git a/src/gpu_coords_manager.cpp b/src/gpu_coords_manager.cpp new file mode 100644 index 00000000..6e960225 --- /dev/null +++ b/src/gpu_coords_manager.cpp @@ -0,0 +1,1527 @@ +/* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu). 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural
+ * Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part
+ * of the code.
+ */
+#include "common.hpp"
+#include "region.hpp"
+#include "utils.hpp"
+
+#include
+
+namespace py = pybind11;
+
+namespace minkowski {
+
+/*
+ * Given tensor_stride_src and tensor_stride_dst, find the respective coord_maps
+ * and return the indices of the coord_map_ind in coord_map_dst
+ */
+template
+vector> GPUCoordsManager::getKernelMap(
+    const vector& tensor_strides, const vector& strides,
+    const vector& kernel_sizes,
+    const vector& dilations, int region_type, at::Tensor offsets,
+    py::object py_in_coords_key, py::object py_out_coords_key,
+    bool is_transpose, bool is_pool) {
+  // WARNING: This function will not work properly with custom region types.
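+  // The result is {in_maps[map_key], out_maps[map_key]}: one int32 index
+  // tensor per kernel offset, pairing input feature rows with the output
+  // feature rows they contribute to (see e.g. ConvolutionForwardKernelGPU).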
+ ASSERT(region_type != 2, + "Currently, it does not support the custom region type."); + /* + const InOutMapKey map_key = getMapHashKey( + tensor_strides, strides, kernel_sizes, dilations, region_type, + py_in_coords_key, py_out_coords_key, is_transpose, is_pool); + + const auto &in_map_iter = in_maps.find(map_key); + */ + +// if (in_map_iter == in_maps.end()) { + const InOutMapKey map_key = getInOutMaps(tensor_strides, strides, kernel_sizes, dilations, region_type, + offsets, py_in_coords_key, py_out_coords_key, false); +// ASSERT(in_maps.find(map_key) != in_maps.end(), "Kernel map not found."); +// } + + return {in_maps[map_key], out_maps[map_key]}; +} + +template +vector +GPUCoordsManager::getCoordsMap(py::object py_in_coords_key, + py::object py_out_coords_key) const { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + const uint64_t in_coords_key = p_in_coords_key->getKey(); + const uint64_t out_coords_key = p_out_coords_key->getKey(); + + const auto in_map_iter = coords_maps.find(in_coords_key); + const auto out_map_iter = coords_maps.find(out_coords_key); + + ASSERT(in_map_iter != coords_maps.end(), "Input coords not found at", + to_string(in_coords_key)); + ASSERT(out_map_iter != coords_maps.end(), "Output coords not found at", + to_string(out_coords_key)); + + const auto &out_tensor_strides = p_out_coords_key->getTensorStride(); + + const auto nrows = in_map_iter->second->nrows; + + at::Tensor in = + torch::empty({static_cast(nrows + 1)}, + torch::TensorOptions().dtype(torch::kInt32)); + at::Tensor out = + torch::empty({static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32)); + + int* p_in = in.data(); + int* p_out = out.data(); + + out_map_iter->second->stride_search(in_map_iter->second, + p_in, p_out, + out_tensor_strides, + nrows); + int size = *(p_in + nrows); + in.resize_({size}); + out.resize_({size}); + return {in, out}; +} + +template +uint64_t +GPUCoordsManager::getCoordsKey(const vector &tensor_strides) const { + auto tensor_stride_hash = hash_vec>(tensor_strides); + ASSERT(coords_maps.find(tensor_stride_hash) != coords_maps.end(), + "The coord map doesn't exist for the given tensor strides ", + "tensor_stride: ", ArrToString(tensor_strides)); + return tensor_stride_hash; +} + +template +bool GPUCoordsManager::existsCoordsKey(const uint64_t coords_key) const { + return coords_maps.find(coords_key) != coords_maps.end(); +} + +template +bool GPUCoordsManager::existsCoordsKey(py::object py_coords_key) const { + CoordsKey *p_coords_key = py_coords_key.cast(); + return existsCoordsKey(p_coords_key->getKey()); +} + +template +uint64_t GPUCoordsManager::getRandomCoordsKey() { + uint64_t coords_key = random(); + while (coords_maps.find(coords_key) != coords_maps.end()) + coords_key = random(); + return coords_key; +} + +template +int GPUCoordsManager::getCoordsSize(const uint64_t coords_key) const { + const auto &coords_map_iter = coords_maps.find(coords_key); + ASSERT(coords_map_iter != coords_maps.end(), + "The coord map doesn't exist for the given coords_key: ", + to_string(coords_key), "."); + return coords_map_iter->second->size(); +} + +template +int GPUCoordsManager::getCoordsSize(py::object py_coords_key) const { + CoordsKey *p_coords_key = py_coords_key.cast(); + return getCoordsSize(p_coords_key->getKey()); +} + +template +void GPUCoordsManager::getCoords(at::Tensor coords, + py::object py_coords_key) const { + CoordsKey *p_coords_key = py_coords_key.cast(); + const uint64_t coords_key = 
p_coords_key->getKey(); + + // initialize + const auto &coords_map_iter = coords_maps.find(coords_key); + ASSERT(coords_map_iter != coords_maps.end(), + "The coord map doesn't exist for the given coords_key: ", + to_string(coords_key), "."); + //const GPUCoordsMap &coordmap = coords_map_iter->second; +// const auto& coordmap = coords_map_iter->second; + /* + int nrows = coordmap->nrows; + int ncols = coordmap->ncols; + */ + int nrows = coords_map_iter->second->nrows; + int ncols = coords_map_iter->second->ncols; + coords.resize_({nrows, ncols}); + int *p_coords = coords.data(); + + //coordmap->get_coords(p_coords, nrows); + coords_map_iter->second->get_coords(p_coords, nrows); +} + +template +void GPUCoordsManager::setOriginCoordsKey(py::object py_coords_key) { + CoordsKey *p_coords_key = py_coords_key.cast(); + const int D = p_coords_key->getDimension(); + ASSERT(D > 0, "Invalid dimension: ", D); + if (!p_coords_key->key_set) { + p_coords_key->setKey(createOriginCoords(D)); + const vector zero_vec(D, 0); + p_coords_key->setTensorStride(zero_vec); + } else { + auto coords_key = p_coords_key->getKey(); + auto origin_key = createOriginCoords(D); + ASSERT(coords_key == origin_key, "Invalid key: ", to_string(coords_key), + " != Origin key: ", to_string(origin_key)); + } +} + +/******************************* + * Initialization + *******************************/ + +/* + * coords: coordinates in IntTensor + * mapping: output mapping in IntTensor + * tensor_strides: current tensor strides this coords will be initializeds + * force_creation: even when there's a duplicate coords with the same tensor + * strides. + * force_remap: if there's duplicate coords, remap + * allow_duplicate_coords: create map when there are duplicates in the + * coordinates + */ +template +uint64_t GPUCoordsManager::initializeCoords( + at::Tensor coords, at::Tensor mapping, at::Tensor inverse_mapping, + const vector &tensor_strides, const bool force_creation, + const bool force_remap, const bool allow_duplicate_coords, + const bool return_inverse) { + device = coords.device(); + const int nrows = coords.size(0); + const int ncols = coords.size(1); + const int D = ncols - 1; + + // Basic assertions + ASSERT(force_creation == true, "force_creation must be true"); + ASSERT(D == tensor_strides.size(), "The coordinate dimension (ncols - 1) ", + to_string(D), + " must match the size of tensor stride: ", ArrToString(tensor_strides), + "."); + + uint64_t key = hash_vec(tensor_strides); + + if (coords_maps.find(key) != coords_maps.end()) { + // If force creation, set a random key that doesn't exist + if (force_creation) { + key = getRandomCoordsKey(); + } else { + ASSERT(false, "The coord map already exists for the given tensor stride ", + "tensor_stride: ", ArrToString(tensor_strides), + "For more information, please refer to the SparseTensor creation " + "documentation available at:" + "https://nvidia.github.io/MinkowskiEngine/sparse_tensor.html"); + } + } + + // Create the concurrent coords map + mapping.resize_(static_cast(nrows)).to(device); + inverse_mapping.resize_(static_cast(nrows)).to(device); + int* p_coords = coords.data(); + int* p_mapping = mapping.data(); + int* p_inverse_mapping = inverse_mapping.data(); + float duplicate_factor = 0.1; + coords_maps[key] = std::make_shared>(nrows, duplicate_factor); + + ASSERT(force_remap == true, + "Please use cpu version when force_remap == false"); + + auto coords_map_size = coords_maps[key]->initialize_batch( + p_coords, p_mapping, p_inverse_mapping, + nrows, ncols, force_remap, 
return_inverse); + + min_nrows = coords_map_size; + min_coords_key = key; + + if (!allow_duplicate_coords && !force_remap) { + ASSERT(nrows == coords_map_size, "Duplicate coordinates found. ", + "Number of input coords:", nrows, + " != Number of unique coords:", coords_map_size, + "If the duplication was intentional, set force_remap to true." + "For more information, please refer to the SparseTensor creation " + "documentation available at: " + "https://nvidia.github.io/MinkowskiEngine/sparse_tensor.html"); + } + + // When remapping, return the mapping to pytorch. + if (force_remap || return_inverse) { +// ASSERT(mapping.dtype() == torch::kInt64, +// "Mapping must be a torch::LongTensor"); + mapping.resize_({coords_map_size}); + } + + if (return_inverse) { +// ASSERT(inverse_mapping.dtype() == torch::kInt64, +// "Inverse Mapping must be a torch::LongTensor"); + ASSERT(inverse_mapping.size(0) == nrows, + "inverse_mapping's size must equal to nrows"); + } + + return key; +} + +template +uint64_t GPUCoordsManager::initializeCoords( + at::Tensor coords, at::Tensor mapping, at::Tensor inverse_mapping, + py::object py_coords_key, const bool force_creation, const bool force_remap, + const bool allow_duplicate_coords, const bool return_inverse) { + CoordsKey *p_coords_key = py_coords_key.cast(); + + const uint64_t in_coords_key = initializeCoords( + coords, mapping, inverse_mapping, p_coords_key->getTensorStride(), + force_creation, force_remap, allow_duplicate_coords, return_inverse); + + // Tensor strides initialized on the python side. + p_coords_key->setKey(in_coords_key); + + return in_coords_key; +} + +/*********************************/ +template +uint64_t GPUCoordsManager::createStridedCoords( + uint64_t coords_key, const vector &tensor_strides, + const vector &strides, bool force_creation) { + // Basic assertions + ASSERT(existsCoordsKey(coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(coords_key), "."); + + const vector out_tensor_strides = + computeOutTensorStride(tensor_strides, strides, false); + + const int D = coords_maps[coords_key]->ncols - 1; + ASSERT(D == tensor_strides.size(), "The coordinate dimensions mismatch. ", + "GPUCoordsManager dimension: ", to_string(D), + ", tensor_strides dimension: ", to_string(tensor_strides.size())); + + uint64_t out_coords_key = 0; + const bool is_identity = + std::all_of(strides.begin(), strides.end(), [](int s) { return s == 1; }); + + if (is_identity) { + out_coords_key = coords_key; + } else { + + // tensor_strides.size() == strides.size() on computeOutTensorStride + out_coords_key = hash_vec(out_tensor_strides); + + // If force creationg, get a random key. + // ElseIf the coordinates already exists, return the key. 
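+    // The strided map is keyed by hash_vec(out_tensor_strides) (e.g. a tensor
+    // stride of {1,1,1} with stride {2,2,2} hashes the output stride {2,2,2}),
+    // so repeated calls producing the same output stride reuse one coords map
+    // unless force_creation draws a fresh random key below.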
+ if (force_creation) { + if (existsCoordsKey(out_coords_key)) + out_coords_key = getRandomCoordsKey(); + } else if (existsCoordsKey(out_coords_key)) { + return out_coords_key; + } + + // Create a strided coords map + int duplicate_factor = 1; + for (auto stride : strides) duplicate_factor *= stride; + duplicate_factor = 1.0 / duplicate_factor; + const auto nrows = coords_maps[coords_key]->nrows; + coords_maps[out_coords_key] = std::make_shared>(nrows, duplicate_factor); + auto out_nrows = coords_maps[out_coords_key]->stride_insert(coords_maps[coords_key], + out_tensor_strides, + nrows); + if (out_nrows < min_nrows) { + min_nrows = out_nrows; + min_coords_key = out_coords_key; + } + } + + return out_coords_key; +} + +template +const InOutMapKey +GPUCoordsManager::getStridedInOutMaps( + py::object py_in_coords_key, py::object py_out_coords_key, + const vector& tensor_strides, const vector& strides, + const vector& kernel_sizes, const vector& dilations, int region_type, + bool is_transpose, bool is_pool, + bool force_creation) { + + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + const uint64_t in_coords_key = p_in_coords_key->getKey(); + uint64_t out_coords_key = 0; + + /* + if (!p_out_coords_key->tensor_stride_set) { + p_out_coords_key->setTensorStride(tensor_strides); + p_out_coords_key->up_stride(strides); + } + */ + + // Basic assertions + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + + const int D = coords_maps[in_coords_key]->ncols - 1; + ASSERT(D == tensor_strides.size(), "The coordinate dimensions mismatch. ", + "GPUCoordsManager dimension: ", to_string(D), + ", tensor_strides dimension: ", to_string(tensor_strides.size())); + + const vector out_tensor_strides = + computeOutTensorStride(tensor_strides, strides, is_transpose); + + const bool is_identity = + std::all_of(strides.begin(), strides.end(), [](int s) { return s == 1; }); + + if (is_identity) { + ASSERT(!p_out_coords_key->isKeySet() || + p_out_coords_key->getKey() == p_in_coords_key->getKey(), + "Be aware of coords_key overwrite leakage"); + out_coords_key = in_coords_key; + p_out_coords_key->setKey(out_coords_key); + if (!p_out_coords_key->tensor_stride_set) { + p_out_coords_key->setTensorStride(tensor_strides); + p_out_coords_key->up_stride(strides); + } + } else if (force_creation) { + return createStridedInOutMaps( + py_in_coords_key, py_out_coords_key, + tensor_strides, strides, + kernel_sizes, dilations, region_type, + is_transpose, is_pool, + true); + } else if (p_out_coords_key->isKeySet()) { + out_coords_key = p_out_coords_key->getKey(); + } else { + out_coords_key = hash_vec(out_tensor_strides); + if (!existsCoordsKey(out_coords_key)) { + return createStridedInOutMaps( + py_in_coords_key, py_out_coords_key, + tensor_strides, strides, + kernel_sizes, dilations, region_type, + is_transpose, is_pool, + false); + } + } + + const InOutMapKey map_key = getMapHashKey( + tensor_strides, strides, kernel_sizes, dilations, region_type, + py_in_coords_key, py_out_coords_key, is_transpose, is_pool); + + if (in_maps.find(map_key) != in_maps.end()) return map_key; + + const auto nrows = coords_maps[in_coords_key]->nrows; + + vector th_ins(1, + torch::empty({static_cast(nrows + 1)}, + torch::TensorOptions().dtype(torch::kInt32))); + vector th_outs(1, + torch::empty({static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32))); + + int* p_in = th_ins[0].data(); + int* 
p_out = th_outs[0].data(); + + coords_maps[out_coords_key]->stride_search(coords_maps[in_coords_key], + p_in, p_out, + out_tensor_strides, + nrows); + int size = *(p_in + nrows); + th_ins[0].resize_({size}); + th_outs[0].resize_({size}); + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + return map_key; +} + +template +const InOutMapKey +GPUCoordsManager::createStridedInOutMaps( + py::object py_in_coords_key, py::object py_out_coords_key, + const vector &tensor_strides, + const vector &strides, + vector kernel_sizes, vector dilations, int region_type, + bool is_transpose, bool is_pool, + bool force_creation) { + + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + const uint64_t in_coords_key = p_in_coords_key->getKey(); + uint64_t out_coords_key = 0; + + // Basic assertions + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + + const int D = coords_maps[in_coords_key]->ncols - 1; + ASSERT(D == tensor_strides.size(), "The coordinate dimensions mismatch. ", + "GPUCoordsManager dimension: ", to_string(D), + ", tensor_strides dimension: ", to_string(tensor_strides.size())); + + const bool is_identity = + std::all_of(strides.begin(), strides.end(), [](int s) { return s == 1; }); + + ASSERT(is_identity == false, + "Please check is_identity in getStridedInOutMaps"); + + const vector out_tensor_strides = + computeOutTensorStride(tensor_strides, strides, is_transpose); + + out_coords_key = hash_vec(out_tensor_strides); + if (force_creation) { + if (existsCoordsKey(out_coords_key)) + out_coords_key = getRandomCoordsKey(); + } else { + ASSERT(!existsCoordsKey(out_coords_key), + "createX will always come from getX, getX has handled this condition"); + } + + p_out_coords_key->setKey(out_coords_key); + + if (!p_out_coords_key->tensor_stride_set) { + p_out_coords_key->setTensorStride(tensor_strides); + p_out_coords_key->up_stride(strides); + } + + const InOutMapKey map_key = getMapHashKey( + tensor_strides, strides, kernel_sizes, dilations, region_type, + py_in_coords_key, py_out_coords_key, is_transpose, is_pool); + +// if (in_maps.find(map_key) != in_maps.end()) return; + ASSERT(in_maps.find(map_key) == in_maps.end(), + "out_coords_key is new, ins/outs maps have to be generated."); + + const auto nrows = coords_maps[in_coords_key]->nrows; + + vector th_ins(1, + torch::empty({static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32))); + vector th_outs(1, + torch::empty({static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32))); + + int* p_in = th_ins[0].data(); + int* p_out = th_outs[0].data(); + + // Create a strided coords map + int duplicate_factor = 1; + for (auto stride : strides) duplicate_factor *= stride; + duplicate_factor = 1.0 / duplicate_factor; + coords_maps[out_coords_key] = std::make_shared>(nrows, duplicate_factor); + auto out_nrows = coords_maps[out_coords_key]->stride_insert_search(coords_maps[in_coords_key], + p_in, p_out, + out_tensor_strides, + nrows); + if (out_nrows < min_nrows) { + min_nrows = out_nrows; + min_coords_key = out_coords_key; + } + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + return map_key; +} + +template +const InOutMapKey +GPUCoordsManager::getTransposedStridedRegionInOutMaps( + py::object py_in_coords_key, py::object py_out_coords_key, + const vector& tensor_strides, + const vector& strides, const vector& kernel_sizes, const vector& dilations, + 
int region_type, + bool is_transpose, bool is_pool, + at::Tensor offsets, + bool force_creation) { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + const uint64_t in_coords_key = p_in_coords_key->getKey(); + uint64_t out_coords_key = 0; + + // Basic assertions + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + + const int D = coords_maps[in_coords_key]->ncols - 1; + ASSERT(D == tensor_strides.size(), "The coordinate dimensions mismatch. ", + "GPUCoordsManager dimension: ", to_string(D), + ", tensor_strides dimension: ", to_string(tensor_strides.size())); + + const vector out_tensor_strides = + computeOutTensorStride(tensor_strides, strides, is_transpose); + + const bool is_identity = + std::all_of(strides.begin(), strides.end(), [](int s) { return s == 1; }); + + if (is_identity) { + ASSERT(!p_out_coords_key->isKeySet() || + p_out_coords_key->getKey() == p_in_coords_key->getKey(), + "Be aware of coords_key overwrite leakage"); + out_coords_key = in_coords_key; + p_out_coords_key->setKey(out_coords_key); + if (!p_out_coords_key->tensor_stride_set) { + p_out_coords_key->setTensorStride(tensor_strides); + p_out_coords_key->up_stride(strides); + } + } else if (force_creation) { + return createTransposedStridedRegionInOutMaps( + py_in_coords_key, py_out_coords_key, + tensor_strides, strides, + kernel_sizes, dilations, region_type, + is_transpose, is_pool, + offsets, + true); + } else if (p_out_coords_key->isKeySet()) { + out_coords_key = p_out_coords_key->getKey(); + } else { + out_coords_key = hash_vec(out_tensor_strides); + if (!existsCoordsKey(out_coords_key)) { + return createTransposedStridedRegionInOutMaps( + py_in_coords_key, py_out_coords_key, + tensor_strides, strides, + kernel_sizes, dilations, region_type, + is_transpose, is_pool, + offsets, + false); + } + } + + const InOutMapKey map_key = getMapHashKey( + tensor_strides, strides, kernel_sizes, dilations, region_type, + py_in_coords_key, py_out_coords_key, is_transpose, is_pool); + + if (in_maps.find(map_key) != in_maps.end()) return map_key; + + const auto nrows = coords_maps[in_coords_key]->nrows; + + // Create transposed coords map + Region region = Region(out_tensor_strides, kernel_sizes, dilations, + region_type, offsets.data(), offsets.size(0)); + +// in_maps[map_key] = vector(region.size(), + vector th_ins(region.size(), + torch::empty({static_cast(nrows + 1)}, + torch::TensorOptions().dtype(torch::kInt32))); +// out_maps[map_key] = vector(region.size(), + vector th_outs(region.size(), + torch::empty({static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32))); + + vector p_ins(region.size()); + vector p_outs(region.size()); + for (size_t c = 0; c != region.size(); ++c) { + p_ins[c] = th_ins[c].data(); + p_outs[c] = th_outs[c].data(); + } + + coords_maps[out_coords_key]->region_search(coords_maps[in_coords_key], + p_ins, p_outs, + region, nrows); + for (size_t c = 0; c != region.size(); ++c) { + int size = *(p_ins[c] + nrows); + th_ins[c].resize_({size}); + th_outs[c].resize_({size}); + } + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + return map_key; +} + +template +const InOutMapKey +GPUCoordsManager::createTransposedStridedRegionInOutMaps( + py::object py_in_coords_key, py::object py_out_coords_key, + const vector& tensor_strides, + const vector& strides, const vector& kernel_sizes, const vector& dilations, + int region_type, + bool 
is_transpose, bool is_pool, + at::Tensor offsets, bool force_creation) { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + const uint64_t in_coords_key = p_in_coords_key->getKey(); + uint64_t out_coords_key = 0; + + // Basic assertions + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + + const int D = coords_maps[in_coords_key]->ncols - 1; + ASSERT(D == tensor_strides.size(), "The coordinate dimensions mismatch. ", + "GPUCoordsManager dimension: ", to_string(D), + ", tensor_strides dimension: ", to_string(tensor_strides.size())); + + const bool is_identity = + std::all_of(strides.begin(), strides.end(), [](int s) { return s == 1; }); + + ASSERT(is_identity == false, + "Please check is_identity in getStridedInOutMaps"); + + const vector out_tensor_strides = + computeOutTensorStride(tensor_strides, strides, is_transpose); + + // Set the out_coords_key and return if a key already exists. + out_coords_key = hash_vec(out_tensor_strides); + if (force_creation) { + // set a random coords key if force creation is set + if (existsCoordsKey(out_coords_key)) + out_coords_key = getRandomCoordsKey(); + } else { + ASSERT(!existsCoordsKey(out_coords_key), + "createX will always come from getX, getX has handled this condition"); + } + + p_out_coords_key->setKey(out_coords_key); + + if (!p_out_coords_key->tensor_stride_set) { + p_out_coords_key->setTensorStride(tensor_strides); + p_out_coords_key->up_stride(strides); + } + + const InOutMapKey map_key = getMapHashKey( + tensor_strides, strides, kernel_sizes, dilations, region_type, + py_in_coords_key, py_out_coords_key, is_transpose, is_pool); + +// if (in_maps.find(map_key) != in_maps.end()) return; + ASSERT(in_maps.find(map_key) == in_maps.end(), + "out_coords_key is new, ins/outs maps have to be generated."); + + const auto nrows = coords_maps[in_coords_key]->nrows; + + // Create transposed coords map + Region region = Region(out_tensor_strides, kernel_sizes, dilations, + region_type, offsets.data(), offsets.size(0)); + +// in_maps[map_key] = vector(region.size(), + vector th_ins(region.size(), + torch::empty({static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32))); +// out_maps[map_key] = vector(region.size(), + vector th_outs(region.size(), + torch::empty({static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32))); + + vector p_ins(region.size()); + vector p_outs(region.size()); + for (size_t c = 0; c != region.size(); ++c) { + p_ins[c] = th_ins[c].data(); + p_outs[c] = th_outs[c].data(); + } + + float duplicate_factor = 1.0; + for (auto stride : strides) duplicate_factor *= stride; + coords_maps[out_coords_key] = std::make_shared>( + nrows, + duplicate_factor); + auto out_nrows = coords_maps[out_coords_key]->region_insert_search( + coords_maps[in_coords_key], + p_ins, p_outs, + region, nrows); + if (out_nrows < min_nrows) { + min_nrows = out_nrows; + min_coords_key = out_coords_key; + } + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + return map_key; +} + +template +uint64_t GPUCoordsManager::createTransposedStridedRegionCoords( + uint64_t coords_key, const vector &tensor_strides, + const vector &strides, vector kernel_sizes, vector dilations, + int region_type, at::Tensor offsets, bool force_creation) { + const vector out_tensor_strides = + computeOutTensorStride(tensor_strides, strides, true /* is_transpose */); + + // Basic assertions + 
ASSERT(existsCoordsKey(coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(coords_key), "."); + + const int D = coords_maps[coords_key]->ncols - 1; + ASSERT(D == tensor_strides.size(), "The coordinate dimensions mismatch. ", + "GPUCoordsManager dimension: ", to_string(D), + ", tensor_strides dimension: ", to_string(tensor_strides.size())); + + // Set the out_coords_key and return if a key already exists. + uint64_t out_coords_key = hash_vec(out_tensor_strides); + if (force_creation) { + // set a random coords key if force creation is set + if (existsCoordsKey(out_coords_key)) + out_coords_key = getRandomCoordsKey(); + } else if (existsCoordsKey(out_coords_key)) { + // Returnn if not force_creation and the key exists + return out_coords_key; + } + + // Create transposed coords map + Region region = Region(out_tensor_strides, kernel_sizes, dilations, + region_type, offsets.data(), offsets.size(0)); + + const int nrows = coords_maps[coords_key]->nrows; + float duplicate_factor = 1.0; + for (auto stride : strides) duplicate_factor *= stride; + coords_maps[out_coords_key] = std::make_shared>(nrows, + duplicate_factor); + auto out_nrows = coords_maps[out_coords_key]->region_insert( + coords_maps[coords_key], region, nrows); + + if (out_nrows < min_nrows) { + min_nrows = out_nrows; + min_coords_key = out_coords_key; + } + return out_coords_key; +} + +template +uint64_t GPUCoordsManager::createOriginCoords(const int D) { + const vector zero_tensor_strides(D, 0); + const uint64_t out_coords_key = hash_vec(zero_tensor_strides); + // If the coordinates already exists, return the key. + if (existsCoordsKey(out_coords_key)) + return out_coords_key; + + coords_maps[out_coords_key] = std::make_shared>(1, 1.0); + // TODO(ljm): implement batch_insert + batch_size = coords_maps[out_coords_key]->batch_insert(coords_maps[min_coords_key], + coords_maps[min_coords_key]->nrows); + if (batch_size < min_nrows) { + min_nrows = batch_size; + min_coords_key = out_coords_key; + } + return out_coords_key; +} + +template +long int GPUCoordsManager::getBatchSize() { + if (batch_size == -1) createOriginCoords(D); + return batch_size; +} + +template +const InOutMapKey GPUCoordsManager::getMapHashKey( + vector tensor_strides, vector strides, vector kernel_sizes, + vector dilations, int region_type, py::object py_in_coords_key, + py::object py_out_coords_key, bool is_transpose, bool is_pool) const { + const int D = tensor_strides.size(); + ASSERT(D == tensor_strides.size() and D == strides.size() and + D == kernel_sizes.size() and D == dilations.size(), + "Size mismatch. 
tensor_strides: ", tensor_strides.size(), + ", strides: ", strides.size(), ", kernel_sizes: ", kernel_sizes.size(), + ", dilations: ", dilations.size()); + + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + const uint64_t out_coords_key = p_out_coords_key->getKey(); + const uint64_t in_coords_key = p_in_coords_key->getKey(); + const uint64_t stride_hash = hash_vec(strides); + const uint64_t kernel_size_hash = hash_vec(kernel_sizes); + const uint64_t dilation_hash = hash_vec(dilations); + const InOutMapKey map_key = { + in_coords_key, out_coords_key, stride_hash, kernel_size_hash, + dilation_hash, (uint64_t)region_type, is_transpose, is_pool}; + + return map_key; +} + +template +const InOutMapKey GPUCoordsManager::getOriginMapHashKey( + py::object py_in_coords_key, py::object py_out_coords_key) const { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + ASSERT( + p_in_coords_key->key_set and p_out_coords_key->key_set, + "Key is not set. in_coords_key: ", to_string(p_in_coords_key->getKey()), + ", out_coords_key: ", to_string(p_out_coords_key->getKey())); + + const int D = p_in_coords_key->getDimension(); + + const uint64_t out_coords_key = p_out_coords_key->getKey(); + const uint64_t in_coords_key = p_in_coords_key->getKey(); + const vector zero_vec(D, 0); + const uint64_t zero_hash = hash_vec(zero_vec); + const InOutMapKey map_key = { + in_coords_key, out_coords_key, zero_hash, zero_hash, zero_hash, 0, false, + true}; + return map_key; +} + +template +const InOutMapKey +GPUCoordsManager::getUnionMapHashKey(vector py_in_coords_keys, + py::object py_out_coords_key) const { + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + ASSERT(py_in_coords_keys.size() > 1, "Number of input coords must be > 1"); + vector p_in_coords_keys; + // We use sum of coords key (even with overflow, it will be unique with high + // prob). We use sum to make the key invariant to the order of the keys. + uint64_t sum_in_coords_key = 0; + CoordsKey *p_in_coords_key = py_in_coords_keys[0].cast(); + for (auto &py_in_coords_key : py_in_coords_keys) { + p_in_coords_key = py_in_coords_key.cast(); + const uint64_t in_coords_key = p_in_coords_key->getKey(); + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + sum_in_coords_key += in_coords_key; + } + + ASSERT(p_out_coords_key->key_set, "Key is not set. out_coords_key: ", + to_string(p_out_coords_key->getKey())); + + const uint64_t out_coords_key = p_out_coords_key->getKey(); + const vector zero_vec(p_in_coords_key->getDimension(), 0); + const uint64_t zero_hash = hash_vec(zero_vec); + InOutMapKey map_key = {sum_in_coords_key, + out_coords_key, + zero_hash, + zero_hash, + zero_hash, + 0, + false, + true}; + return map_key; +} +/** + * Entry function for coords map generation and the associated kernel maps. + */ +template +const InOutMapKey +GPUCoordsManager::getInOutMaps( + const vector &tensor_strides, const vector &strides, + const vector &kernel_sizes, const vector &dilations, + int region_type, const at::Tensor &offsets, py::object py_in_coords_key, + py::object py_out_coords_key, bool is_transpose, bool is_pool, + bool force_creation) { + // + // Warning(ljm): In the GPU version, when `is_transpose == True`, + // the `Filp` ins/outs maps generation in CPU is not used. 
+  // It is the same as the `non-Flip` version, except that when `is_pool == True`
+  // and `kernel_size` is even there is a small difference in the ins/outs maps.
+  // This does not change the mathematical meaning, just like the customized
+  // `region`-based sparse convolution compared with the classic version that
+  // mirrors the regular `Convolution` in torch, as `spconv` does.
+  // Also, the `non-Flip` GPU implementation can merge the insert and search
+  // operations, reducing the region iterations from two to one, which should
+  // be a significant optimization.
+  //
+  // Another remark:
+  // By the logic of the CPU implementation, the following ins/outs flip cache
+  // will never hit:
+  //   in_maps[map_key] = out_maps[tmp_map_key];
+  //   out_maps[map_key] = in_maps[tmp_map_key];
+  // so we do not check for it in the GPU version. If it ever did hit, the
+  // `non-Flip` GPU implementation would catch it automatically.
+  //
+  const int D = tensor_strides.size();
+  ASSERT(D == tensor_strides.size() and D == strides.size() and
+             D == kernel_sizes.size() and D == dilations.size(),
+         "Size mismatch. tensor_strides: ", tensor_strides.size(),
+         ", strides: ", strides.size(), ", kernel_sizes: ", kernel_sizes.size(),
+         ", dilations: ", dilations.size());
+  ASSERT(std::all_of(tensor_strides.begin(), tensor_strides.end(),
+                     [](int k) { return k > 0; }),
+         "Invalid tensor_strides: ", ArrToString(tensor_strides),
+         " Tensor strides must be positive integers.");
+
+  if (!is_transpose) {
+    // TODO(ljm): track the update in the cpu version
+    // TODO: even numbered kernel size to use region_type 0
+    if (is_pool && (strides == kernel_sizes)) {
+      return getStridedInOutMaps(
+          py_in_coords_key, py_out_coords_key,
+          tensor_strides, strides,
+          kernel_sizes, dilations, region_type,
+          is_transpose, is_pool,
+          force_creation);
+    } else {
+      CoordsKey *p_in_coords_key = py_in_coords_key.cast<CoordsKey *>();
+      CoordsKey *p_out_coords_key = py_out_coords_key.cast<CoordsKey *>();
+      // Will return the in_coords_key if strides == 1.
+      auto out_coords_key = createStridedCoords(
+          p_in_coords_key->getKey(), tensor_strides, strides, force_creation);
+
+      p_out_coords_key->setKey(out_coords_key);
+      if (!p_out_coords_key->tensor_stride_set) {
+        p_out_coords_key->setTensorStride(tensor_strides);
+        p_out_coords_key->up_stride(strides);
+      }
+
+      // Use the Transposed path to generate the Non-Transposed maps.
+      // Flip is needed, but Flip equals Non-Flip when the kernel is symmetric
+      // and differs only slightly when the kernel is non-symmetric.
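+      // Illustrative example: for an odd kernel size such as 3 the 1-D offset
+      // set is {-1, 0, 1}, which is symmetric under negation, so the flipped
+      // and unflipped maps coincide; for an even kernel size the offset set is
+      // not symmetric under negation, which is where the small difference
+      // mentioned above can appear.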
+ return getTransposedStridedRegionInOutMaps( + py_in_coords_key, py_out_coords_key, + tensor_strides, strides, kernel_sizes, + dilations, region_type, + is_transpose, is_pool, + offsets, false); + } + } else { + const bool is_identity = + std::all_of(strides.begin(), strides.end(), [](int s) { return s == 1; }); + ASSERT(is_identity == false, + "It is meaningless of identity in transpose conv"); + + if (is_pool && strides == kernel_sizes && region_type == 0) { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + auto out_coords_key = createTransposedStridedRegionCoords( + p_in_coords_key->getKey(), tensor_strides, strides, kernel_sizes, + dilations, region_type, offsets, force_creation); + + p_out_coords_key->setKey(out_coords_key); + if (!p_out_coords_key->tensor_stride_set) { + p_out_coords_key->setTensorStride(tensor_strides); + p_out_coords_key->up_stride(strides); + } + + return getStridedInOutMaps( + py_in_coords_key, py_out_coords_key, + tensor_strides, strides, + kernel_sizes, dilations, region_type, + is_transpose, is_pool, + false); + } else { + return getTransposedStridedRegionInOutMaps( + py_in_coords_key, py_out_coords_key, + tensor_strides, strides, kernel_sizes, + dilations, region_type, + is_transpose, is_pool, + offsets, force_creation); + } + } +} + +template +const InOutMapKey +GPUCoordsManager::getOriginInOutMaps(py::object py_in_coords_key, + py::object py_out_coords_key) { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + + const int D = p_in_coords_key->getDimension(); + // Create output coordinates if it doesn't exist + if (!p_out_coords_key->key_set) { + p_out_coords_key->setKey(createOriginCoords(D)); + const vector zero_vec(D, 0); + p_out_coords_key->setTensorStride(zero_vec); + } + + const uint64_t in_coords_key = p_in_coords_key->getKey(); + const uint64_t out_coords_key = p_out_coords_key->getKey(); + + // Map key for origin hash map + const InOutMapKey map_key = + getOriginMapHashKey(py_in_coords_key, py_out_coords_key); + + // For non transpose case + // make a kernel mapping. The kernel will be saved with the map_key. + if (in_maps.find(map_key) == in_maps.end()) { + ASSERT(coords_maps[out_coords_key]->size() == batch_size, + "Coords size mismatch. 
GPUCoordsMap size: ", + coords_maps[out_coords_key]->size(), + ", batch size: ", batch_size); + const auto nrows = coords_maps[in_coords_key]->nrows; + vector th_ins(1, torch::empty( + {static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32))); + vector th_outs(1, torch::empty( + {static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32))); + int* p_in = th_ins[0].data(); + int* p_out = th_outs[0].data(); + coords_maps[out_coords_key]->batch_search( + coords_maps[in_coords_key], + p_in, p_out, nrows); + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + } + return map_key; +} + +template +pair, vector> +GPUCoordsManager::getUnionMap(vector py_in_coords_keys, + py::object py_out_coords_key) { + + // all exception handling will be done inside the following + const auto map_key = getUnionInOutMaps(py_in_coords_keys, py_out_coords_key); + + return {in_maps[map_key], out_maps[map_key]}; +} + +// WARNING(ljm): this is not a in-use function +template +uint64_t +GPUCoordsManager::createUnionCoords(vector py_in_coords_keys, + py::object py_out_coords_key) { + + //vector>> in_coords_maps(py_in_coords_keys.size()); + vector>> in_coords_maps(py_in_coords_keys.size()); + vector in_coords_map_sizes(py_in_coords_keys.size()); + CoordsKey *p_in_coords_key = py_in_coords_keys[0].cast(); + auto tensor_strides = p_in_coords_key->getTensorStride(); + //GPUCoordsMap& curr_map = coords_maps[p_in_coords_key->getKey()]; + in_coords_maps[0] = coords_maps[p_in_coords_key->getKey()]; + in_coords_map_sizes[0] = in_coords_maps[0]->nrows; + int total_in_keys = in_coords_map_sizes[0]; + for (size_t i = 1; i != py_in_coords_keys.size(); ++i) { + // Set the tensor strides to the smallest elements. + p_in_coords_key = py_in_coords_keys[i].cast(); + transform(tensor_strides.begin(), /* In1 begin */ + tensor_strides.end(), /* In1 end */ + p_in_coords_key->getTensorStride().begin(), /* In2 begin */ + tensor_strides.begin(), /* out begin */ + [](int a, int b) -> int { return std::min(a, b); } /* binary op */ + ); + in_coords_maps[i] = coords_maps[p_in_coords_key->getKey()]; + in_coords_map_sizes[i] = in_coords_maps[i]->nrows; + total_in_keys += in_coords_map_sizes[i]; + + const uint64_t in_coords_key = p_in_coords_key->getKey(); + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + } + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + + // set a random coords key + const uint64_t out_coords_key = getRandomCoordsKey(); + + // Set the pycoordskey using the last coords_key + p_out_coords_key->setDimension(p_in_coords_key->getDimension()); + p_out_coords_key->setKey(out_coords_key); + p_out_coords_key->setTensorStride(tensor_strides); + + coords_maps[out_coords_key] = + std::make_shared>(total_in_keys, 1.0 / in_coords_map_sizes.size()); + + auto out_nrows = coords_maps[out_coords_key]->union_insert(in_coords_maps, + in_coords_map_sizes); + + if (out_nrows < min_nrows) { + min_nrows = out_nrows; + min_coords_key = out_coords_key; + } + return out_coords_key; +} + +template +const InOutMapKey +GPUCoordsManager::createUnionInOutMaps(const vector& py_in_coords_keys, + py::object py_out_coords_key) { + + //vector>> in_coords_maps(py_in_coords_keys.size()); + vector>> in_coords_maps(py_in_coords_keys.size()); + vector in_coords_map_sizes(py_in_coords_keys.size()); + vector th_ins(py_in_coords_keys.size()); + vector th_outs(py_in_coords_keys.size()); + vector p_ins(py_in_coords_keys.size()); + vector 
p_outs(py_in_coords_keys.size()); + CoordsKey *p_in_coords_key = py_in_coords_keys[0].cast(); + auto tensor_strides = p_in_coords_key->getTensorStride(); + in_coords_maps[0] = coords_maps[p_in_coords_key->getKey()]; + in_coords_map_sizes[0] = in_coords_maps[0]->nrows; + int total_in_keys = in_coords_map_sizes[0]; + th_ins[0] = torch::empty( + {static_cast(in_coords_map_sizes[0])}, torch::TensorOptions().dtype(torch::kInt32)); + th_outs[0] = torch::empty( + {static_cast(in_coords_map_sizes[0])}, torch::TensorOptions().dtype(torch::kInt32)); + p_ins[0] = th_ins[0].data(); + p_outs[0] = th_outs[0].data(); + for (size_t i = 1; i != py_in_coords_keys.size(); ++i) { + // Set the tensor strides to the smallest elements. + p_in_coords_key = py_in_coords_keys[i].cast(); + transform(tensor_strides.begin(), /* In1 begin */ + tensor_strides.end(), /* In1 end */ + p_in_coords_key->getTensorStride().begin(), /* In2 begin */ + tensor_strides.begin(), /* out begin */ + [](int a, int b) -> int { return std::min(a, b); } /* binary op */ + ); + in_coords_maps[i] = coords_maps[p_in_coords_key->getKey()]; + in_coords_map_sizes[i] = in_coords_maps[i]->nrows; + total_in_keys += in_coords_map_sizes[i]; + th_ins[i] = torch::empty( + {static_cast(in_coords_map_sizes[i])}, torch::TensorOptions().dtype(torch::kInt32)); + th_outs[i] = torch::empty( + {static_cast(in_coords_map_sizes[i])}, torch::TensorOptions().dtype(torch::kInt32)); + p_ins[i] = th_ins[i].data(); + p_outs[i] = th_outs[i].data(); + + const uint64_t in_coords_key = p_in_coords_key->getKey(); + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + } + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + + // set a random coords key + const uint64_t out_coords_key = getRandomCoordsKey(); + + // Set the pycoordskey using the last coords_key + p_out_coords_key->setDimension(p_in_coords_key->getDimension()); + p_out_coords_key->setKey(out_coords_key); + p_out_coords_key->setTensorStride(tensor_strides); + + coords_maps[out_coords_key] = + std::make_shared>(total_in_keys, 1.0 / in_coords_map_sizes.size()); + + auto out_nrows = coords_maps[out_coords_key]->union_insert_search(in_coords_maps, + p_ins, p_outs, + in_coords_map_sizes); + + if (out_nrows < min_nrows) { + min_nrows = out_nrows; + min_coords_key = out_coords_key; + } + // Map key for origin hash map + const InOutMapKey map_key = + getUnionMapHashKey(py_in_coords_keys, py_out_coords_key); + + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + + return map_key; +} + +template +const InOutMapKey +GPUCoordsManager::getUnionInOutMaps(vector py_in_coords_keys, + py::object py_out_coords_key) { + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + + // Create output coordinates if it doesn't exist + if (!p_out_coords_key->key_set) + return createUnionInOutMaps(py_in_coords_keys, py_out_coords_key); + + const uint64_t out_coords_key = p_out_coords_key->getKey(); + + // Map key for origin hash map + const InOutMapKey map_key = + getUnionMapHashKey(py_in_coords_keys, py_out_coords_key); + + if (in_maps.find(map_key) == in_maps.end()) { + //vector>> in_coords_maps(py_in_coords_keys.size()); + vector>> in_coords_maps(py_in_coords_keys.size()); + vector in_coords_map_sizes(py_in_coords_keys.size()); + vector th_ins(py_in_coords_keys.size()); + vector th_outs(py_in_coords_keys.size()); + vector p_ins(py_in_coords_keys.size()); + vector p_outs(py_in_coords_keys.size()); + CoordsKey *p_in_coords_key = 
py_in_coords_keys[0].cast(); + auto tensor_strides = p_in_coords_key->getTensorStride(); + in_coords_maps[0] = coords_maps[p_in_coords_key->getKey()]; + in_coords_map_sizes[0] = in_coords_maps[0]->nrows; + int total_in_keys = in_coords_map_sizes[0]; + th_ins[0] = torch::empty( + {static_cast(in_coords_map_sizes[0])}, torch::TensorOptions().dtype(torch::kInt32)); + th_outs[0] = torch::empty( + {static_cast(in_coords_map_sizes[0])}, torch::TensorOptions().dtype(torch::kInt32)); + p_ins[0] = th_ins[0].data(); + p_outs[0] = th_outs[0].data(); + for (size_t i = 1; i != py_in_coords_keys.size(); ++i) { + // Set the tensor strides to the smallest elements. + p_in_coords_key = py_in_coords_keys[i].cast(); + transform(tensor_strides.begin(), /* In1 begin */ + tensor_strides.end(), /* In1 end */ + p_in_coords_key->getTensorStride().begin(), /* In2 begin */ + tensor_strides.begin(), /* out begin */ + [](int a, int b) -> int { return std::min(a, b); } /* binary op */ + ); + in_coords_maps[i] = coords_maps[p_in_coords_key->getKey()]; + in_coords_map_sizes[i] = in_coords_maps[i]->nrows; + total_in_keys += in_coords_map_sizes[i]; + th_ins[i] = torch::empty( + {static_cast(in_coords_map_sizes[i])}, torch::TensorOptions().dtype(torch::kInt32)); + th_outs[i] = torch::empty( + {static_cast(in_coords_map_sizes[i])}, torch::TensorOptions().dtype(torch::kInt32)); + p_ins[i] = th_ins[i].data(); + p_outs[i] = th_outs[i].data(); + + const uint64_t in_coords_key = p_in_coords_key->getKey(); + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + } + + coords_maps[out_coords_key]->union_search(in_coords_maps, + p_ins, p_outs, + in_coords_map_sizes); + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + } + + return map_key; +} + +template +uint64_t +GPUCoordsManager::createPruningCoords(at::Tensor use_feat, + py::object py_in_coords_key, + py::object py_out_coords_key) { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + ASSERT(!p_out_coords_key->isKeySet(), + "p_out_coords_key should be unsetted"); + + const uint64_t in_coords_key = p_in_coords_key->getKey(); + + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + + // set a random coords key + const uint64_t out_coords_key = getRandomCoordsKey(); + + // Set the pycoordskey + p_out_coords_key->setKey(out_coords_key); + p_out_coords_key->setDimension(p_in_coords_key->getDimension()); + if (!p_out_coords_key->tensor_stride_set) + p_out_coords_key->setTensorStride(p_in_coords_key->getTensorStride()); + + coords_maps[out_coords_key] = std::make_shared>(use_feat.size(0)); + auto out_nrows = coords_maps[out_coords_key]->prune_insert(coords_maps[in_coords_key], + use_feat.data(), + use_feat.size(0), + coords_maps[in_coords_key]->nrows); + + if (out_nrows < min_nrows) { + min_nrows = out_nrows; + min_coords_key = out_coords_key; + } + return out_coords_key; +} + +template +const InOutMapKey +GPUCoordsManager::createPruningInOutMaps(at::Tensor use_feat, + py::object py_in_coords_key, + py::object py_out_coords_key) { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + ASSERT(!p_out_coords_key->isKeySet(), + "p_out_coords_key should be unsetted"); + + const uint64_t in_coords_key = p_in_coords_key->getKey(); + ASSERT(existsCoordsKey(in_coords_key), + "The coord map 
doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + + const uint64_t out_coords_key = getRandomCoordsKey(); + p_out_coords_key->setKey(out_coords_key); + p_out_coords_key->setDimension(p_in_coords_key->getDimension()); + if (!p_out_coords_key->tensor_stride_set) + p_out_coords_key->setTensorStride(p_in_coords_key->getTensorStride()); + + const InOutMapKey map_key = + getOriginMapHashKey(py_in_coords_key, py_out_coords_key); + + if (in_maps.find(map_key) != in_maps.end()) return map_key; + + vector th_ins(1, torch::empty( + {static_cast(use_feat.size(0))}, + torch::TensorOptions().dtype(torch::kInt32))); + vector th_outs(1, torch::empty( + {static_cast(use_feat.size(0))}, + torch::TensorOptions().dtype(torch::kInt32))); + + int* p_in = th_ins[0].data(); + int* p_out = th_outs[0].data(); + + coords_maps[out_coords_key] = std::make_shared>(use_feat.size(0)); + auto out_nrows = coords_maps[out_coords_key]->prune_insert_search(coords_maps[in_coords_key], + p_in, p_out, + use_feat.data(), + use_feat.size(0), + coords_maps[in_coords_key]->nrows); + + if (out_nrows < min_nrows) { + min_nrows = out_nrows; + min_coords_key = out_coords_key; + } + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + return map_key; +} + +template +const InOutMapKey +GPUCoordsManager::getPruningInOutMaps(at::Tensor use_feat, + py::object py_in_coords_key, + py::object py_out_coords_key) { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + + // Create output coordinates if it doesn't exist + if (!p_out_coords_key->key_set) { + // The following function setup py_out_coords_key + return createPruningInOutMaps(use_feat, py_in_coords_key, py_out_coords_key); + } + + const uint64_t in_coords_key = p_in_coords_key->getKey(); + const uint64_t out_coords_key = p_out_coords_key->getKey(); + + // Use the map key for origin hash map (stride, dilation, kernel are all + // NULL) + const InOutMapKey map_key = + getOriginMapHashKey(py_in_coords_key, py_out_coords_key); + + // For non transpose case + // make a kernel mapping. The kernel will be saved with the map_key. 
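+  // Note on the buffer layout used below: the in-map buffer is allocated with
+  // one extra slot; after the search, the element at index `use_feat.size(0)`
+  // holds the number of valid pairs and both tensors are resized to it. The
+  // strided and region searches above use the same convention.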
+ if (in_maps.find(map_key) == in_maps.end()) { + vector th_ins(1, torch::empty( + {static_cast(use_feat.size(0) + 1)}, + torch::TensorOptions().dtype(torch::kInt32))); + vector th_outs(1, torch::empty( + {static_cast(use_feat.size(0))}, + torch::TensorOptions().dtype(torch::kInt32))); + + int* p_in = th_ins[0].data(); + int* p_out = th_outs[0].data(); + +// coords_maps[out_coords_key] = GPUCoordsMap(use_feat.size(0)); + coords_maps[out_coords_key]->prune_search(coords_maps[in_coords_key], + p_in, p_out, + use_feat.data(), + use_feat.size(0), + coords_maps[in_coords_key]->nrows); + int size = *(p_in + use_feat.size(0)); + th_ins[0].resize_(size); + th_outs[0].resize_(size); + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + } + + return map_key; +} + + +template string GPUCoordsManager::toString() const { + Formatter out; + out << "< GPUCoordsManager\n\tNumber of Coordinate Maps: " + << to_string(coords_maps.size()); + for (const auto &kv : coords_maps) { + out << " \n\t\tCoordinate Map Key: " << to_string(kv.first) + << ", Size: " << to_string((kv.second)->size()); + } + out << "\n\tNumber of Kernel Maps: " << to_string(in_maps.size()); + for (const auto &kv : in_maps) { + size_t size = 0; + for (const auto &map : kv.second) + size += map.size(0); + out << " \n\t\tKernel In-Out Map Key: " + << to_string(hash_vec(kv.first)) + << ", Size: " << to_string(size); + } + out << " >\n"; + return out; +} + +// TODO(ljm): implement GPUCoordsMap::print +/* +template +void GPUCoordsManager::printDiagnostics(py::object py_coords_key) const { + CoordsKey *p_coords_key = py_coords_key.cast(); + const auto &map_iter = coords_maps.find(p_coords_key->getKey()); + ASSERT(map_iter != coords_maps.end(), "Coords map does not exist."); + map_iter->second.print(); +} +*/ + +/* + * Return row indices for each batch index + */ +template +at::Tensor +GPUCoordsManager::getRowIndicesAtBatchIndex(py::object py_in_coords_key, + py::object py_out_coords_key, + const int batch_index) { + // py_out_coords_key will be set after the above call. + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + const auto in_coords_key = p_in_coords_key->getKey(); + const auto in_map_iter = coords_maps.find(in_coords_key); + ASSERT(in_map_iter != coords_maps.end(), + "The in_coords_key, ", to_string(in_coords_key), ", does not exist."); + + const auto& coordsmap = in_map_iter->second; + const auto nrows = coordsmap->nrows; + //const auto batch_num = coordsmap->get_batch_num(); + const auto batch_num = getBatchSize(); + ASSERT(batch_index < batch_num, "batch_index: ", to_string(batch_index), + " must smaller than batch_num: ", to_string(batch_num)); + + at::Tensor out_ind = torch::zeros( + {static_cast(nrows + 1)}, torch::TensorOptions().dtype(torch::kInt32).device(device)); + int* p_out_ind = out_ind.data(); + //coordsmap.GetIndexAtBatch(p_out_ind, batch_index); + coordsmap->get_index_at_batch(p_out_ind, batch_index, nrows); + int size = *(p_out_ind + nrows); + out_ind.resize_({size}); + //out_ind.resize_(c10::IntArrayRef(reinterpret_cast(p_out_ind + nrows), 1)); + + return out_ind; +} + +/* + * Return row indices per batch + */ +template +vector +GPUCoordsManager::getRowIndicesPerBatch(py::object py_in_coords_key, + py::object py_out_coords_key) { + // py_out_coords_key will be set after the above call. 
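+  // The batch count is taken from getBatchSize(), which lazily builds the
+  // origin coords map via createOriginCoords() on first use, rather than being
+  // read off the coords map itself.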
+ CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + const auto in_coords_key = p_in_coords_key->getKey(); + const auto in_map_iter = coords_maps.find(in_coords_key); + ASSERT(in_map_iter != coords_maps.end(), + "The in_coords_key, ", to_string(in_coords_key), ", does not exist."); + + const auto& coordsmap = in_map_iter->second; +// const auto batch_num = coordsmap->get_batch_num(); + const auto batch_num = getBatchSize(); + const auto nrows = coordsmap->nrows; + // Return index. + vector out_inds(batch_num, torch::zeros( + {static_cast(nrows + 1)}, torch::TensorOptions().dtype(torch::kInt32))); + vector p_out_inds(batch_num); + for (size_t b = 0; b != batch_num; ++b) p_out_inds[b] = out_inds[b].data(); + //coordsmap.GetIndexPerBatch(p_out_inds); + coordsmap->get_index_per_batch(p_out_inds, nrows); + for (size_t b = 0; b != batch_num; ++b) { + int size = *(p_out_inds[b] + nrows); + out_inds[b].resize_({size}); +// out_inds[b].resize_(c10::IntArrayRef(reinterpret_cast(p_out_inds[b] + nrows), 1)); + } + + return out_inds; +} + +template class GPUCoordsManager; + +} // end namespace minkowski diff --git a/src/gpu_coords_manager.hpp b/src/gpu_coords_manager.hpp new file mode 100644 index 00000000..bdb8b644 --- /dev/null +++ b/src/gpu_coords_manager.hpp @@ -0,0 +1,291 @@ +/* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu). + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural + * Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part + * of the code. 
+ */ +#ifndef GPU_COORDS_MAN +#define GPU_COORDS_MAN + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "gpu_coordsmap.hpp" +#include "types.hpp" +#include "utils.hpp" + +#ifndef CPU_ONLY +#include "gpu_memory_manager.hpp" +#include +#endif // CPU_ONLY + +namespace minkowski { + +using std::begin; +using std::end; +using std::get; +using std::move; +using std::ref; +using std::string; +using std::to_string; +using std::unordered_map; + +/* +inline vector computeOutTensorStride(const vector &tensor_strides, + const vector &strides, + bool is_transpose) { + vector out_tensor_strides; + ASSERT(tensor_strides.size() == strides.size(), + "The dimension of tensor_stride: ", ArrToString(tensor_strides), + " does not match the dimension of strides: ", ArrToString(strides)); + for (size_t i = 0; i < strides.size(); i++) { + if (is_transpose) { + ASSERT(tensor_strides[i] % strides[i] == 0, + "The output tensor stride is not divisible by ", + "up_strides. tensor stride: ", ArrToString(tensor_strides), + ", up_strides: ", ArrToString(strides)); + out_tensor_strides.push_back(tensor_strides[i] / strides[i]); + } else + out_tensor_strides.push_back(tensor_strides[i] * strides[i]); + } + return out_tensor_strides; +} +*/ + +#ifndef CPU_ONLY + +template int getInOutMapsSizeGPU(const VType &map) { + int n = 0; + for (auto cmap = begin(map); cmap != end(map); ++cmap) + n += cmap->size(0); + return n; +} + +template class GPUCoordsManager { +public: + // Variables + // + // Coordinate hash key to coordinate hash map + unordered_map>> coords_maps; + + set batch_indices; + int batch_size; + int D; + int device_id; + c10::Device device; + int min_nrows; + uint64_t min_coords_key; + + std::shared_ptr gpu_memory_manager; + + // In to out index mapping for each kernel, pooling + unordered_map, InOutMapKeyHash> in_maps; + unordered_map, InOutMapKeyHash> out_maps; + + GPUCoordsManager(int D, + int device_id, + MemoryManagerBackend backend) : batch_size(-1), device(c10::DeviceType::CUDA, 0) { + gpu_memory_manager = std::make_shared(backend); + this->device_id = device_id; + this->D = D; + min_nrows = INT_MAX; + } + ~GPUCoordsManager() { clear(); } + +// TODO(ljm): implement GPUCoordsMap::print +// void printDiagnostics(py::object py_coords_key) const; + + uint64_t getCoordsKey(const vector &tensor_strides) const; + bool existsCoordsKey(uint64_t coords_key) const; + bool existsCoordsKey(py::object py_coords_key) const; + bool existsInOutMapKey(const InOutMapKey &map_key) const { + return in_maps.find(map_key) != in_maps.end(); + } + int getCoordsSize(uint64_t coords_key) const; + int getCoordsSize(py::object py_coords_key) const; + uint64_t getRandomCoordsKey(); + long int getBatchSize(); + set getBatchIndices() { + if (batch_indices.empty()) { + for (int b = 0; b != getBatchSize(); ++b) batch_indices.insert(b); + } + ASSERT((int)batch_indices.size() == getBatchSize(), + "batch_indices.size() must be equal to getBatchSize()"); + return batch_indices; + } + void getCoords(at::Tensor coords, py::object py_coords_key) const; + vector> + getKernelMap(const vector& tensor_strides, const vector& strides, + const vector& kernel_sizes, const vector& dilations, int region_type, + at::Tensor offsets, py::object py_in_coords_key, + py::object py_out_coords_key, bool is_transpose, bool is_pool); + // TODO make this function non-const with ability to generate a new map + vector getCoordsMap(py::object py_in_coords_key, + py::object py_out_coords_key) const; 
+ pair, vector> + getUnionMap(vector py_in_coords_keys, + py::object py_out_coords_key); + + // Set the py_coords_key to the origin coords map key + void setOriginCoordsKey(py::object py_coords_key); + + // New coords map initialzation entry + uint64_t initializeCoords(at::Tensor coords, at::Tensor mapping, + at::Tensor inverse_mapping, + const vector &tensor_strides, + const bool force_creation, const bool force_remap, + const bool allow_duplicate_coords, + const bool return_inverse); + + uint64_t initializeCoords(at::Tensor coords, at::Tensor mapping, + at::Tensor inverse_mapping, + py::object py_coords_key, const bool force_creation, + const bool force_remap, + const bool allow_duplicate_coords, + const bool return_inverse); + + // New coords map given an input + uint64_t createStridedCoords(uint64_t coords_key, + const vector &tensor_strides, + const vector &strides, bool force_creation); + uint64_t createTransposedStridedRegionCoords( + uint64_t coords_key, const vector &tensor_strides, + const vector &strides, vector kernel_sizes, + vector dilations, int region_type, at::Tensor offsets, + bool force_creation); + uint64_t createPruningCoords(at::Tensor use_feat, py::object py_in_coords_key, + py::object py_out_coords_key); + uint64_t createOriginCoords(const int D); + uint64_t createUnionCoords(vector py_in_coords_keys, + py::object py_out_coords_key); + + // Mappings + const InOutMapKey getMapHashKey(vector tensor_strides, + vector strides, vector kernel_sizes, + vector dilations, int region_type, + py::object py_in_coords_key, + py::object py_out_coords_key, + bool is_transpose, bool is_pool) const; + const InOutMapKey getOriginMapHashKey(py::object py_in_coords_key, + py::object py_out_coords_key) const; + const InOutMapKey getUnionMapHashKey(vector py_in_coords_keys, + py::object py_out_coords_key) const; + + // Wrapper functions for setting up coords and returning maps + const InOutMapKey + getInOutMaps(const vector &tensor_strides, const vector &strides, + const vector &kernel_sizes, const vector &dilations, + int region_type, const at::Tensor &offsets, + py::object py_in_coords_key, py::object py_out_coords_key, + bool is_transpose, bool is_pool = false, + bool generate_new_coords = false); + + const InOutMapKey getOriginInOutMaps(py::object py_in_coords_key, + py::object py_out_coords_key); + + const InOutMapKey getPruningInOutMaps(at::Tensor use_feat, + py::object py_in_coords_key, + py::object py_out_coords_key); + + const InOutMapKey + getUnionInOutMaps(vector py_in_coords_keys, + py::object py_out_coords_key); + + const InOutMapKey + getStridedInOutMaps( + py::object py_in_coords_key, py::object py_out_coords_key, + const vector& tensor_strides, const vector& strides, + const vector& kernel_sizes, const vector& dilations, int region_type, + bool is_transpose, bool is_pool, + bool force_creation); + + const InOutMapKey + createStridedInOutMaps( + py::object py_in_coords_key, py::object py_out_coords_key, + const vector &tensor_strides, + const vector &strides, + vector kernel_sizes, vector dilations, int region_type, + bool is_transpose, bool is_pool, + bool force_creation); + + const InOutMapKey + getTransposedStridedRegionInOutMaps( + py::object py_in_coords_key, py::object py_out_coords_key, + const vector& tensor_strides, + const vector& strides, const vector& kernel_sizes, const vector& dilations, + int region_type, + bool is_transpose, bool is_pool, + at::Tensor offsets, + bool force_creation); + + const InOutMapKey + createTransposedStridedRegionInOutMaps( + 
py::object py_in_coords_key, py::object py_out_coords_key, + const vector& tensor_strides, + const vector& strides, const vector& kernel_sizes, const vector& dilations, + int region_type, + bool is_transpose, bool is_pool, + at::Tensor offsets, bool force_creation); + + const InOutMapKey + createUnionInOutMaps(const vector& py_in_coords_keys, + py::object py_out_coords_key); + + const InOutMapKey + createPruningInOutMaps(at::Tensor use_feat, + py::object py_in_coords_key, + py::object py_out_coords_key); + + string toString() const; + void clear() { + coords_maps.clear(); + in_maps.clear(); + out_maps.clear(); + } + + at::Tensor getRowIndicesAtBatchIndex(py::object py_in_coords_key, + py::object py_out_coords_key, + const int batch_index); + vector getRowIndicesPerBatch(py::object py_in_coords_key, + py::object py_out_coords_key); + + void *getScratchGPUMemory(size_t size) { + return gpu_memory_manager.get()->tmp_data(size); + } + + void clearScratchGPUMemory() { gpu_memory_manager.get()->clear_tmp(); } + +}; // gpucoordsmanager +#endif + +} // namespace minkowski + +#endif // GPU_COORDS_MAN diff --git a/src/gpu_coordsmap.cpp b/src/gpu_coordsmap.cpp new file mode 100644 index 00000000..874afbad --- /dev/null +++ b/src/gpu_coordsmap.cpp @@ -0,0 +1,308 @@ +/* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu). + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural + * Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part + * of the code. 
+ */ +#include +#include +#include + +#include "gpu_coordsmap.hpp" + +namespace minkowski { + +/* + * Use this function when batch_size is setted outside +template +GPUCoordsMap::GPUCoordsMap(int ncols_, int batch_size) + : nrows(batch_size), ncols(ncols_) { + map->BulkBatchIndiceInsert(ncols_, batch_size); +} +*/ + +// TODO(ljm): add prune_insert, prune_insert_search, prune_search + +/* +template +GPUCoordsMap::GPUCoordsMap(uint32_t map_size, float duplicate_factor, + uint32_t keys_per_bucket=62, const uint32_t device_id=0) { + // TODO(ljm): add this api + map->reserve(map_size, duplicate_factor, + keys_per_bucket, device_id); +} +*/ + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::initialize_batch(const int* p_coords, + int* p_mapping, + int* p_inverse_mapping, + const int nrows_, + const int ncols_, const bool force_remap, + const bool return_inverse) { + nrows = nrows_; + ncols = ncols_; + + map->BulkInsert(p_coords, p_mapping, p_inverse_mapping, nrows, ncols); + + nrows = map->Size(); + return nrows; +} + +template +void GPUCoordsMap::get_coords(int* p_coords, int size) { + map->IterateKeys(p_coords, size); +} + +template +void GPUCoordsMap::get_index_at_batch(int* p_out, + int batch_index, + int nrows_) { + map->IterateSearchAtBatch(p_out, batch_index, nrows_); +} + +template +void GPUCoordsMap::get_index_per_batch( + const vector& p_outs, + int nrows_) { + map->IterateSearchPerBatch(p_outs, nrows_); +} + +template +typename GPUCoordsMap::value_type +//GPUCoordsMap::region_insert(const GPUCoordsMap& in_coords_map, +GPUCoordsMap::region_insert(const std::shared_ptr>& in_coords_map, + const Region ®ion, int size) { + ASSERT(region.tensor_strides.size() == ncols - 1, "Invalid tensor strides"); + + vector offsets(region.size(), torch::empty( + {static_cast(ncols)}, torch::TensorOptions().dtype(torch::kInt32))); + vector origin(ncols, 0); + Region cregion(region); + cregion.set_bounds(origin); + int c = 0; + for (const auto& point : cregion) { + CHECK_CUDA(cudaMemcpy(offsets[c].data(), point.data(), + sizeof(int) * ncols, + cudaMemcpyHostToDevice)); + map->IterateOffsetInsert(in_coords_map->map, + //map.IterateOffsetInsert(map, + offsets[c].data(), + in_coords_map->nrows); + ++c; + } + nrows = map->Size(); + return nrows; +} + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::region_insert_search(const shared_ptr>& in_coords_map, + const vector& p_ins, + const vector& p_outs, + const Region ®ion, + int size) { + ASSERT(region.tensor_strides.size() == ncols - 1, "Invalid tensor strides"); + + vector offsets(region.size(), torch::empty( + {static_cast(ncols)}, torch::TensorOptions().dtype(torch::kInt32))); + vector origin(ncols, 0); + Region cregion(region); + cregion.set_bounds(origin); + int c = 0; + for (const auto& point : cregion) { + CHECK_CUDA(cudaMemcpy(offsets[c].data(), point.data(), + sizeof(int) * ncols, + cudaMemcpyHostToDevice)); + map->IterateOffsetInsertWithInsOuts(in_coords_map->map, + offsets[c].data(), + p_ins[c], p_outs[c], + size); + ++c; + } + nrows = map->Size(); + return nrows; +} + +template +void +GPUCoordsMap::region_search(const shared_ptr>& in_coords_map, + const vector& p_ins, + const vector& p_outs, + const Region ®ion, + int size) { + ASSERT(region.tensor_strides.size() == ncols - 1, "Invalid tensor strides"); + + vector offsets(region.size(), torch::empty( + {static_cast(ncols)}, torch::TensorOptions().dtype(torch::kInt32))); + vector origin(ncols, 0); + Region cregion(region); + cregion.set_bounds(origin); + int c = 0; + for (const 
auto& point : cregion) { + CHECK_CUDA(cudaMemcpy(offsets[c].data(), point.data(), + sizeof(int) * ncols, + cudaMemcpyHostToDevice)); + map->IterateOffsetSearch(in_coords_map->map, + offsets[c].data(), + p_ins[c], p_outs[c], + size); + ++c; + } +} + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::batch_insert(const shared_ptr>& in_coords_map, + int size) { + map->IterateBatchInsert(in_coords_map->map, size); + nrows = map->Size(); + return nrows; +} + +template +void +GPUCoordsMap::batch_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, int size) { + map->IterateBatchSearch(in_coords_map->map, p_in, p_out, size); +} + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::prune_insert(const shared_ptr>& in_coords_map, + bool* p_keep, int keep_size, + int size) { + map->IteratePruneInsert(in_coords_map->map, p_keep, keep_size, size); + nrows = map->Size(); + return nrows; +} + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::prune_insert_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, + bool* p_keep, int keep_size, + int size) { + map->IteratePruneInsertWithInOut(in_coords_map->map, + p_in, p_out, + p_keep, keep_size, size); + nrows = map->Size(); + return nrows; +} + +template +void +GPUCoordsMap::prune_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, + bool* p_keep, int keep_size, + int size) { + map->IteratePruneSearch(in_coords_map->map, + p_in, p_out, + p_keep, keep_size, size); +} + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::stride_insert(const shared_ptr>& in_coords_map, + const vector& tensor_strides, + int size) { + map->IterateStrideInsert(in_coords_map->map, tensor_strides, size); + nrows = map->Size(); + return nrows; +} + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::stride_insert_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, + const vector& tensor_strides, + int size) { + map->IterateStrideInsertWithInOut(in_coords_map->map, + p_in, p_out, + tensor_strides, size); + nrows = map->Size(); + return nrows; +} + +template +void +GPUCoordsMap::stride_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, + const vector& tensor_strides, + int size) { + map->IterateStrideSearch(in_coords_map->map, + p_in, p_out, + tensor_strides, size); +} + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::union_insert( + const vector>>& in_maps, + const vector& in_coords_map_sizes) { + for (size_t i = 0; i != in_maps.size(); ++i) { + map->IterateInsert(in_maps[i]->map, + in_coords_map_sizes[i]); + } + nrows = map->Size(); + return nrows; +} + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::union_insert_search( + const vector>>& in_maps, + const vector& p_ins, const vector& p_outs, + const vector& in_coords_map_sizes) { + for (size_t i = 0; i != in_maps.size(); ++i) { + map->IterateInsertWithInsOuts(in_maps[i]->map, p_ins[i], p_outs[i], + in_coords_map_sizes[i]); + } + nrows = map->Size(); + return nrows; +} + +template +void GPUCoordsMap::union_search( + const vector>>& in_maps, + const vector& p_ins, const vector& p_outs, + const vector& in_coords_map_sizes) { + for (size_t i = 0; i != in_maps.size(); ++i) { + map->IterateSearch(in_maps[i]->map, p_ins[i], p_outs[i], + in_coords_map_sizes[i]); + } +} + +// TODO(ljm): add a debug helper function here +/* +template void GPUCoordsMap::print() const { + for (const auto &kv : map) { + std::cout << ArrToString(kv.first) << ":" << kv.second << "\n"; + } + std::cout << std::flush; +} +*/ + 
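+// Only a single explicit instantiation is emitted below; the commented-out
+// line keeps a second map specialization available if it is ever needed.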
+template struct GPUCoordsMap; +//template struct GPUCoordsMap; + +} // end namespace minkowski diff --git a/src/gpu_coordsmap.hpp b/src/gpu_coordsmap.hpp new file mode 100644 index 00000000..a55fcb4e --- /dev/null +++ b/src/gpu_coordsmap.hpp @@ -0,0 +1,260 @@ +/* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu). + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural + * Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part + * of the code. + */ +#ifndef GPU_COORDSMAP +#define GPU_COORDSMAP + +#include +#include +#include +#include +#include + +#include "3rdparty/gpu_coords_map/include/cuda_unordered_map.h" +#include "3rdparty/gpu_coords_map/include/coordinate.h" + +#include "region.hpp" +#include "types.hpp" + +namespace minkowski { + +using std::reference_wrapper; +using std::set; +using std::tuple; +using std::vector; +using std::shared_ptr; + +// TODO(ljm): enumerate and `DISPATCH` all possible combination +// D = 3 +using CoordsToIndexMap_int_4_int_5_0 = + cuda::unordered_map, int, 5, 0>; + +using CoordsToIndexMap_int_4_int_5_1 = + cuda::unordered_map, int, 5, 1>; + +using CoordsToIndexMap_int_4_int_5_2 = + cuda::unordered_map, int, 5, 2>; + +using CoordsToIndexMap_int_4_int_5_3 = + cuda::unordered_map, int, 5, 3>; + +using CoordsToIndexMap_int_4_int_5_4 = + cuda::unordered_map, int, 5, 4>; + +using CoordsToIndexMap_int_4_int_5_5 = + cuda::unordered_map, int, 5, 5>; + +using CoordsToIndexMap_int_4_int_5_6 = + cuda::unordered_map, int, 5, 6>; + +using CoordsToIndexMap_int_4_int_5_7 = + cuda::unordered_map, int, 5, 7>; + +using CoordsToIndexMap_int_4_int_5_8 = + cuda::unordered_map, int, 5, 8>; + +// D = 4 +using CoordsToIndexMap_int_5_int_5_0 = + cuda::unordered_map, int, 5, 0>; + +using CoordsToIndexMap_int_5_int_5_1 = + cuda::unordered_map, int, 5, 1>; + +using CoordsToIndexMap_int_5_int_5_2 = + cuda::unordered_map, int, 5, 2>; + +using CoordsToIndexMap_int_5_int_5_3 = + cuda::unordered_map, int, 5, 3>; + +using CoordsToIndexMap_int_5_int_5_4 = + cuda::unordered_map, int, 5, 4>; + +using CoordsToIndexMap_int_5_int_5_5 = + cuda::unordered_map, int, 5, 5>; + +using CoordsToIndexMap_int_5_int_5_6 = + cuda::unordered_map, int, 5, 6>; + +using CoordsToIndexMap_int_5_int_5_7 = + cuda::unordered_map, int, 5, 7>; + +using CoordsToIndexMap_int_5_int_5_8 = + cuda::unordered_map, int, 5, 8>; + +// D = 5 +using CoordsToIndexMap_int_6_int_5_0 = + cuda::unordered_map, int, 5, 
0>; + +using CoordsToIndexMap_int_6_int_5_1 = + cuda::unordered_map, int, 5, 1>; + +using CoordsToIndexMap_int_6_int_5_2 = + cuda::unordered_map, int, 5, 2>; + +using CoordsToIndexMap_int_6_int_5_3 = + cuda::unordered_map, int, 5, 3>; + +using CoordsToIndexMap_int_6_int_5_4 = + cuda::unordered_map, int, 5, 4>; + +using CoordsToIndexMap_int_6_int_5_5 = + cuda::unordered_map, int, 5, 5>; + +using CoordsToIndexMap_int_6_int_5_6 = + cuda::unordered_map, int, 5, 6>; + +using CoordsToIndexMap_int_6_int_5_7 = + cuda::unordered_map, int, 5, 7>; + +using CoordsToIndexMap_int_6_int_5_8 = + cuda::unordered_map, int, 5, 8>; + +// D = 6 +using CoordsToIndexMap_int_7_int_5_0 = + cuda::unordered_map, int, 5, 0>; + +using CoordsToIndexMap_int_7_int_5_1 = + cuda::unordered_map, int, 5, 1>; + +using CoordsToIndexMap_int_7_int_5_2 = + cuda::unordered_map, int, 5, 2>; + +using CoordsToIndexMap_int_7_int_5_3 = + cuda::unordered_map, int, 5, 3>; + +using CoordsToIndexMap_int_7_int_5_4 = + cuda::unordered_map, int, 5, 4>; + +using CoordsToIndexMap_int_7_int_5_5 = + cuda::unordered_map, int, 5, 5>; + +using CoordsToIndexMap_int_7_int_5_6 = + cuda::unordered_map, int, 5, 6>; + +using CoordsToIndexMap_int_7_int_5_7 = + cuda::unordered_map, int, 5, 7>; + +using CoordsToIndexMap_int_7_int_5_8 = + cuda::unordered_map, int, 5, 8>; + + +using CoordsToIndexMapGPU = CoordsToIndexMap_int_4_int_5_5; + +template struct GPUCoordsMap { + shared_ptr map; + using key_type = typename MapType::key_type; + using value_type = typename MapType::value_type; + int nrows, ncols; + + // Constructors + GPUCoordsMap(uint32_t map_size, float duplicate_factor=1.0, + uint32_t keys_per_bucket=62, const uint32_t device_id=0) { + /* + map->reserve(map_size, duplicate_factor, + keys_per_bucket, device_id); + */ + map = std::make_shared(map_size, duplicate_factor, + keys_per_bucket, device_id); + } + + // Initializations + value_type + initialize_batch(const int* p_coords_, + int* p_mapping_, + int* p_inverse_mapping_, + const int nrows_, const int ncols_, + const bool force_remap = false, + const bool return_inverse = false); + + void get_coords(int* p_coords, int size); + void get_index_at_batch(int* p_out, int batch_index, int nrows_); + void get_index_per_batch(const vector& p_outs, int nrows_); + value_type + //region_insert(const GPUCoordsMap& in_coords_map, + region_insert(const shared_ptr>& in_coords_map, + const Region ®ion, int size); + value_type + region_insert_search(const shared_ptr>& in_coords_map, + const vector& p_ins, + const vector& p_outs, + const Region ®ion, + int size); + void region_search(const shared_ptr>& in_coords_map, + const vector& p_ins, + const vector& p_outs, + const Region ®ion, + int size); + value_type + batch_insert(const shared_ptr>& in_coords_map, int size); + void batch_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, int size); + value_type + stride_insert(const shared_ptr>& in_coords_map, + const vector& tensor_strides, + int size); + value_type + stride_insert_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, + const vector& tensor_strides, + int size); + void stride_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, + const vector& tensor_strides, + int size); + value_type + union_insert( + const vector>>& in_maps, + const vector& in_coords_map_sizes); + value_type + union_insert_search( + const vector>>& in_maps, + const vector& p_ins, const vector& p_outs, + const vector& in_coords_map_sizes); + void union_search( + const vector>>& in_maps, + const 
vector& p_ins, const vector& p_outs, + const vector& in_coords_map_sizes); + value_type + prune_insert(const shared_ptr>& in_coords_map, + bool* p_keep, int keep_size, + int size); + value_type + prune_insert_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, + bool* p_keep, int keep_size, + int size); + void + prune_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, + bool* p_keep, int keep_size, + int size); + size_t size() const { + ASSERT(map->Size() == nrows, "map->Size() should equal to nrows"); + return nrows; + } +}; + +} // end namespace minkowski + +#endif // gpu coordsmap diff --git a/src/gpu_memory_manager.hpp b/src/gpu_memory_manager.hpp index 255c32da..8fb4a5e9 100644 --- a/src/gpu_memory_manager.hpp +++ b/src/gpu_memory_manager.hpp @@ -77,6 +77,7 @@ class GPUMemoryManager { pInOutMaps copyInOutMapToGPU(const InOutMaps &map); +// TODO(ljm): support multi-thread here void clear_tmp() { for (auto p_buffer : tmp_vec_ptr) { cudaFree(p_buffer); @@ -107,10 +108,14 @@ class GPUMemoryManager { } case PYTORCH: { // std::cout << "Malloc PYTORCH: " << device_id << std::endl; + std::cout << "not support currently: " << device_id << std::endl; + // + /* CUDA_CHECK(cudaSetDevice(device_id)); p_buffer = c10::cuda::CUDACachingAllocator::raw_alloc_with_stream( size, at::cuda::getCurrentCUDAStream()); persist_vec_ptr.push_back(p_buffer); + */ break; } } diff --git a/src/pooling_avg.cpp b/src/pooling_avg.cpp index 9fafb9d4..9c5780bf 100644 --- a/src/pooling_avg.cpp +++ b/src/pooling_avg.cpp @@ -101,9 +101,9 @@ void AvgPoolingForwardGPU(at::Tensor in_feat, at::Tensor out_feat, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); - const auto &in_out = p_coords_manager->getInOutMapsGPU( + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); + const InOutMapKey map_key = p_coords_manager->getInOutMaps( tensor_strides, strides, kernel_sizes, dilations, region_type, offsets, py_in_coords_key, py_out_coords_key, false, true); @@ -124,7 +124,10 @@ void AvgPoolingForwardGPU(at::Tensor in_feat, at::Tensor out_feat, NonzeroAvgPoolingForwardKernelGPU( in_feat.template data(), in_feat.size(0), out_feat.template data(), out_nrows, num_nonzero_data, - in_feat.size(1), in_out.first, in_out.second, use_avg, handle, + in_feat.size(1), + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + use_avg, handle, at::cuda::getCurrentCUDAStream()); } @@ -136,8 +139,8 @@ void AvgPoolingBackwardGPU(at::Tensor in_feat, at::Tensor grad_in_feat, int region_type, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); const InOutMapKey map_key = p_coords_manager->getMapHashKey( tensor_strides, strides, kernel_sizes, dilations, region_type, py_in_coords_key, py_out_coords_key, false, true); @@ -153,8 +156,8 @@ void AvgPoolingBackwardGPU(at::Tensor in_feat, at::Tensor grad_in_feat, grad_in_feat.template data(), in_feat.size(0), grad_out_feat.template data(), grad_out_feat.size(0), num_nonzero.template data(), in_feat.size(1), - p_coords_manager->d_in_maps[map_key], - p_coords_manager->d_out_maps[map_key], use_avg, + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], use_avg, at::cuda::getCurrentCUDAStream()); } #endif 
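
Note on the convention introduced in the pooling hunks above: the GPU paths no longer receive a pair of pInOutMaps<int>; they look up an InOutMapKey and index the manager's in_maps / out_maps tables, which hold contiguous int32 CUDA tensors. The following is a self-contained sketch of that storage convention, not code from the patch; it uses only plain libtorch and the CUDA runtime, and the buffer sizes are illustrative.

#include <torch/torch.h>
#include <cuda_runtime.h>
#include <vector>

int main() {
  // Hypothetical kernel map for a single offset, already resident on the GPU,
  // matching what in_maps[map_key] / out_maps[map_key] are expected to hold.
  auto opts = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA);
  std::vector<at::Tensor> in_maps{torch::arange(0, 16, opts)};

  // Scratch sizing now accumulates Tensor::size(0) instead of pVector::size().
  int64_t nmaps = 0;
  for (const auto &map : in_maps)
    nmaps += map.size(0);

  // The maps already live on the device, so the copy into the kernel's scratch
  // buffer stays device-to-device, sourced from the tensor's raw int pointer
  // (the patch itself spells this in_maps[0].data<int>()).
  int *d_scr = nullptr;
  cudaMalloc((void **)&d_scr, nmaps * sizeof(int));
  cudaMemcpy(d_scr, in_maps[0].data_ptr<int>(), nmaps * sizeof(int),
             cudaMemcpyDeviceToDevice);
  cudaFree(d_scr);
  return 0;
}

This also explains why the kernels in pooling_avg.cu below accumulate nmaps with map.size(0) and then copy from in_maps[0] alone: as the existing comments state, the per-offset maps are contiguous, so the first tensor's pointer covers all nmaps entries.
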
@@ -188,28 +191,28 @@ template void AvgPoolingBackwardCPU( py::object py_coords_manager, bool use_avg); #ifndef CPU_ONLY -template void AvgPoolingForwardGPU( +template void AvgPoolingForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg); -template void AvgPoolingForwardGPU( +template void AvgPoolingForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg); -template void AvgPoolingBackwardGPU( +template void AvgPoolingBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg); -template void AvgPoolingBackwardGPU( +template void AvgPoolingBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, diff --git a/src/pooling_avg.cu b/src/pooling_avg.cu index 1f1865a2..5ad34ccb 100644 --- a/src/pooling_avg.cu +++ b/src/pooling_avg.cu @@ -116,8 +116,7 @@ template void NonzeroAvgPoolingForwardKernelGPU(const Dtype *d_in_feat, int in_nrows, Dtype *d_out_feat, int out_nrows, Dtype *d_num_nonzero, int nchannel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, bool use_avg, cusparseHandle_t cushandle, cudaStream_t stream) { int nmaps = 0; @@ -129,7 +128,7 @@ void NonzeroAvgPoolingForwardKernelGPU(const Dtype *d_in_feat, int in_nrows, // Copy all maps to one vector for (const auto &map : in_maps) - nmaps += map.size(); + nmaps += map.size(0); /* Map prep */ // Create d in map @@ -139,10 +138,10 @@ void NonzeroAvgPoolingForwardKernelGPU(const Dtype *d_in_feat, int in_nrows, d_out_map = d_scr + nmaps; // n_maps d_csr_row = d_scr + 2 * nmaps; // out_nrows + 1 - CUDA_CHECK(cudaMemcpy(d_in_map, in_maps[0].data(), nmaps * sizeof(int), + CUDA_CHECK(cudaMemcpy(d_in_map, in_maps[0].data(), nmaps * sizeof(int), cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaMemcpy(d_out_map, out_maps[0].data(), nmaps * sizeof(int), + CUDA_CHECK(cudaMemcpy(d_out_map, out_maps[0].data(), nmaps * sizeof(int), cudaMemcpyDeviceToDevice)); /* sparse mm prep */ @@ -235,38 +234,40 @@ void NonzeroAvgPoolingForwardKernelGPU(const Dtype *d_in_feat, int in_nrows, template void NonzeroAvgPoolingForwardKernelGPU( const float *d_in_feat, int in_nrows, float *d_out_feat, int out_nrows, - float *d_num_nonzero, int nchannel, const pInOutMaps &in_map, - const pInOutMaps &out_map, bool use_avg, + float *d_num_nonzero, int nchannel, + const vector& in_maps, const vector& out_maps, + bool use_avg, cusparseHandle_t cushandle, cudaStream_t stream); template void NonzeroAvgPoolingForwardKernelGPU( const double *d_in_feat, int in_nrows, double *d_out_feat, int out_nrows, - double *d_num_nonzero, int nchannel, const pInOutMaps &in_map, - const pInOutMaps &out_map, bool use_avg, + double *d_num_nonzero, int nchannel, + const vector& in_maps, const 
vector& out_maps, + bool use_avg, cusparseHandle_t cushandle, cudaStream_t stream); template void NonzeroAvgPoolingBackwardKernelGPU( Dtype *d_grad_in_feat, int in_nrows, const Dtype *d_grad_out_feat, int out_nrows, const Dtype *d_num_nonzero, int nchannel, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, bool use_avg, cudaStream_t stream) { // d_grad_in_feat must be all set to 0 int nmaps = 0; for (const auto &map : in_maps) - nmaps += map.size(); + nmaps += map.size(0); if (use_avg) { set_gradient_nonzero_avg <<>>( nmaps * nchannel, d_grad_out_feat, d_grad_in_feat, nchannel, - d_num_nonzero, in_maps[0].data(), out_maps[0].data()); + d_num_nonzero, in_maps[0].data(), out_maps[0].data()); } else { set_gradient_nonzero <<>>( nmaps * nchannel, d_grad_out_feat, d_grad_in_feat, nchannel, - in_maps[0].data(), out_maps[0].data()); + in_maps[0].data(), out_maps[0].data()); } CUDA_CHECK(cudaGetLastError()); @@ -276,13 +277,13 @@ void NonzeroAvgPoolingBackwardKernelGPU( template void NonzeroAvgPoolingBackwardKernelGPU( float *d_grad_in_feat, int in_nrows, const float *d_grad_out_feat, int out_nrows, const float *d_num_nonzero, int nchannel, - const pInOutMaps &in_map, const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, bool use_avg, cudaStream_t stream); template void NonzeroAvgPoolingBackwardKernelGPU( double *d_grad_in_feat, int in_nrows, const double *d_grad_out_feat, int out_nrows, const double *d_num_nonzero, int nchannel, - const pInOutMaps &in_map, const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, bool use_avg, cudaStream_t stream); } // end namespace minkowski diff --git a/src/pooling_avg.cuh b/src/pooling_avg.cuh index 1ee83050..eeaeca12 100644 --- a/src/pooling_avg.cuh +++ b/src/pooling_avg.cuh @@ -27,6 +27,7 @@ #include #include +#include #include "gpu.cuh" #include "math_functions.hpp" @@ -38,8 +39,7 @@ template void NonzeroAvgPoolingForwardKernelGPU(const Dtype *d_in_feat, int in_nrows, Dtype *d_out_feat, int out_nrows, Dtype *d_num_nonzero, int nchannel, - const pInOutMaps &in_map, - const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, bool use_avg, cusparseHandle_t cushandle, cudaStream_t stream); @@ -47,7 +47,7 @@ template void NonzeroAvgPoolingBackwardKernelGPU( Dtype *d_grad_in_feat, int in_nrows, const Dtype *d_grad_out_feat, int out_nrows, const Dtype *d_num_nonzero, int nchannel, - const pInOutMaps &in_map, const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, bool use_avg, cudaStream_t stream); } // end namespace minkowski diff --git a/src/pooling_global_avg.cpp b/src/pooling_global_avg.cpp index 1c648aac..6f1feff3 100644 --- a/src/pooling_global_avg.cpp +++ b/src/pooling_global_avg.cpp @@ -134,8 +134,8 @@ vector GlobalPoolingForwardGPU(at::Tensor in_feat, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg, int pooling_mode) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); const auto batch_size = p_coords_manager->getBatchSize(); if (batch_size == 1) { @@ -173,7 +173,7 @@ vector GlobalPoolingForwardGPU(at::Tensor in_feat, } } break; case 2: { - const auto &in_outs = p_coords_manager->getOriginInOutMapsGPU( + const InOutMapKey map_key = p_coords_manager->getOriginInOutMaps( py_in_coords_key, py_out_coords_key); cusparseHandle_t handle = at::cuda::getCurrentCUDASparseHandle(); @@ -182,8 +182,10 @@ vector 
GlobalPoolingForwardGPU(at::Tensor in_feat, NonzeroAvgPoolingForwardKernelGPU( in_feat.template data(), in_feat.size(0), out_feat.template data(), batch_size, - num_nonzero.template data(), in_feat.size(1), in_outs.first, - in_outs.second, use_avg, handle, at::cuda::getCurrentCUDAStream()); + num_nonzero.template data(), in_feat.size(1), + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + use_avg, handle, at::cuda::getCurrentCUDAStream()); } break; default: @@ -199,8 +201,8 @@ GlobalPoolingBackwardGPU(at::Tensor in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg) { - CoordsManager *p_coords_man = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_man = + py_coords_manager.cast *>(); const auto batch_size = p_coords_man->getBatchSize(); auto grad_in_feat = torch::empty_like(in_feat); @@ -218,15 +220,13 @@ GlobalPoolingBackwardGPU(at::Tensor in_feat, at::Tensor grad_out_feat, p_coords_man->existsInOutMapKey(map_key), "The in-out map doesn't exist for backward. Did you run forward pass?"); - p_coords_man->copyInOutMapsToGPU(map_key); - grad_in_feat.zero_(); NonzeroAvgPoolingBackwardKernelGPU( grad_in_feat.template data(), in_feat.size(0), grad_out_feat.template data(), grad_out_feat.size(0), num_nonzero.template data(), in_feat.size(1), - p_coords_man->d_in_maps[map_key], p_coords_man->d_out_maps[map_key], + p_coords_man->in_maps[map_key], p_coords_man->out_maps[map_key], use_avg, at::cuda::getCurrentCUDAStream()); } return grad_in_feat; @@ -254,22 +254,22 @@ template at::Tensor GlobalPoolingBackwardCPU( py::object py_coords_manager, bool use_avg); #ifndef CPU_ONLY -template vector GlobalPoolingForwardGPU( +template vector GlobalPoolingForwardGPU( at::Tensor in_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg, int pooling_mode); -template vector GlobalPoolingForwardGPU( +template vector GlobalPoolingForwardGPU( at::Tensor in_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg, int pooling_mode); -template at::Tensor GlobalPoolingBackwardGPU( +template at::Tensor GlobalPoolingBackwardGPU( at::Tensor in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg); -template at::Tensor GlobalPoolingBackwardGPU( +template at::Tensor GlobalPoolingBackwardGPU( at::Tensor in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg); diff --git a/src/pooling_global_max.cpp b/src/pooling_global_max.cpp index dec5d937..09437fe0 100644 --- a/src/pooling_global_max.cpp +++ b/src/pooling_global_max.cpp @@ -86,9 +86,9 @@ void GlobalMaxPoolingForwardGPU(at::Tensor in_feat, at::Tensor out_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); - const auto &in_out = p_coords_manager->getOriginInOutMapsGPU( + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); + const InOutMapKey map_key = p_coords_manager->getOriginInOutMaps( py_in_coords_key, py_out_coords_key); const int out_nrows = p_coords_manager->getCoordsSize(py_out_coords_key); @@ -99,14 +99,16 @@ void GlobalMaxPoolingForwardGPU(at::Tensor in_feat, at::Tensor out_feat, 
num_nonzero.zero_(); // Compute the scratch space - const int nmap = getInOutMapsSize(in_out.first); + const int nmap = getInOutMapsSizeGPU(p_coords_manager->in_maps[map_key]); int *d_scr = (int *)p_coords_manager->getScratchGPUMemory(5 * nmap * sizeof(int)); MaxPoolingForwardKernelGPU( in_feat.template data(), out_feat.template data(), - out_nrows, num_nonzero.template data(), nchannel, get<0>(in_out), - get<1>(in_out), d_scr, at::cuda::getCurrentCUDAStream()); + out_nrows, num_nonzero.template data(), nchannel, + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + d_scr, at::cuda::getCurrentCUDAStream()); p_coords_manager->clearScratchGPUMemory(); } @@ -150,22 +152,22 @@ template void GlobalMaxPoolingBackwardCPU( py::object py_out_coords_key, py::object py_coords_manager); #ifndef CPU_ONLY -template void GlobalMaxPoolingForwardGPU( +template void GlobalMaxPoolingForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor num_nonzero, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void GlobalMaxPoolingForwardGPU( +template void GlobalMaxPoolingForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor num_nonzero, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void GlobalMaxPoolingBackwardGPU( +template void GlobalMaxPoolingBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void GlobalMaxPoolingBackwardGPU( +template void GlobalMaxPoolingBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); diff --git a/src/pooling_max.cpp b/src/pooling_max.cpp index 7ac88719..7d9a9c64 100644 --- a/src/pooling_max.cpp +++ b/src/pooling_max.cpp @@ -96,9 +96,9 @@ void MaxPoolingForwardGPU(at::Tensor in_feat, at::Tensor out_feat, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); - const auto &in_out = p_coords_manager->getInOutMapsGPU( + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); + const InOutMapKey map_key = p_coords_manager->getInOutMaps( tensor_strides, strides, kernel_sizes, dilations, region_type, offsets, py_in_coords_key, py_out_coords_key, false, true); @@ -110,14 +110,16 @@ void MaxPoolingForwardGPU(at::Tensor in_feat, at::Tensor out_feat, num_nonzero.zero_(); // Compute the scratch space - int nmap = getInOutMapsSize(in_out.first); + int nmap = getInOutMapsSizeGPU(p_coords_manager->in_maps[map_key]); int *d_scr = (int *)p_coords_manager->getScratchGPUMemory(5 * nmap * sizeof(int)); MaxPoolingForwardKernelGPU( in_feat.template data(), out_feat.template data(), - out_nrows, num_nonzero.data(), nchannel, in_out.first, in_out.second, + out_nrows, num_nonzero.data(), nchannel, + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], d_scr, at::cuda::getCurrentCUDAStream()); p_coords_manager->clearScratchGPUMemory(); @@ -172,28 +174,28 @@ template void MaxPoolingBackwardCPU( #ifndef CPU_ONLY -template void MaxPoolingForwardGPU( +template void MaxPoolingForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector 
dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void MaxPoolingForwardGPU( +template void MaxPoolingForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void MaxPoolingBackwardGPU( +template void MaxPoolingBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void MaxPoolingBackwardGPU( +template void MaxPoolingBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, diff --git a/src/pooling_max.cu b/src/pooling_max.cu index 7861fe32..61e1424b 100644 --- a/src/pooling_max.cu +++ b/src/pooling_max.cu @@ -102,24 +102,24 @@ namespace minkowski { template void MaxPoolingForwardKernelGPU(const Dtype *d_in_feat, Dtype *d_out_feat, int out_nrows, Itype *d_max_index, int nchannel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, Itype *d_scr, + const vector& in_maps, const vector& out_maps, + Itype *d_scr, cudaStream_t stream) { int nmap = 0; // Copy all maps to one vector for (const auto &map : in_maps) - nmap += map.size(); + nmap += map.size(0); Itype *d_in_map = d_scr, *d_out_map = d_scr + nmap; CUDA_CHECK(cudaMemcpy(d_in_map, - in_maps[0].data(), // in_maps are contiguous of size nnz + in_maps[0].data(), // in_maps are contiguous of size nnz nmap * sizeof(int), cudaMemcpyDeviceToDevice)); CUDA_CHECK( cudaMemcpy(d_out_map, - out_maps[0].data(), // out_maps are contiguous of size nnz + out_maps[0].data(), // out_maps are contiguous of size nnz nmap * sizeof(int), cudaMemcpyDeviceToDevice)); // First, sort d_out_map and d_in_map with the d_out_map so that in_feat are @@ -171,13 +171,15 @@ void MaxPoolingForwardKernelGPU(const Dtype *d_in_feat, Dtype *d_out_feat, template void MaxPoolingForwardKernelGPU( const float *d_in_feat, float *d_out_feat, int out_nrows, - int32_t *d_max_index, int nchannel, const pInOutMaps &in_map, - const pInOutMaps &out_map, int32_t *d_scr, cudaStream_t stream); + int32_t *d_max_index, int nchannel, + const vector& in_maps, const vector& out_maps, + int32_t *d_scr, cudaStream_t stream); template void MaxPoolingForwardKernelGPU( const double *d_in_feat, double *d_out_feat, int out_nrows, - int32_t *d_max_index, int nchannel, const pInOutMaps &in_map, - const pInOutMaps &out_map, int32_t *d_scr, cudaStream_t stream); + int32_t *d_max_index, int nchannel, + const vector& in_maps, const vector& out_maps, + int32_t *d_scr, cudaStream_t stream); template void MaxPoolingBackwardKernelGPU(Dtype *d_grad_in_feat, int in_nrows, diff --git a/src/pooling_max.cuh b/src/pooling_max.cuh index 684e4e8a..62066a69 100644 --- a/src/pooling_max.cuh +++ b/src/pooling_max.cuh @@ -27,6 +27,7 @@ #include #include +#include #include "gpu.cuh" #include "math_functions.hpp" @@ -37,8 +38,8 @@ namespace minkowski { template void MaxPoolingForwardKernelGPU(const Dtype *d_in_feat, Dtype *d_out_feat, int out_nrows, Itype *d_max_index, int nchannel, - const pInOutMaps 
&in_map, - const pInOutMaps &out_map, Itype *d_scr, + const vector& in_maps, const vector& out_maps, + Itype *d_scr, cudaStream_t stream); template diff --git a/src/pooling_transpose.cpp b/src/pooling_transpose.cpp index a32c6aa7..45a0f496 100644 --- a/src/pooling_transpose.cpp +++ b/src/pooling_transpose.cpp @@ -122,9 +122,9 @@ void PoolingTransposeForwardGPU(at::Tensor in_feat, at::Tensor out_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); - const auto &in_out = p_coords_manager->getInOutMapsGPU( + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); + const InOutMapKey map_key = p_coords_manager->getInOutMaps( tensor_strides, strides, kernel_sizes, dilations, region_type, offsets, py_in_coords_key, py_out_coords_key, true, true); @@ -140,8 +140,10 @@ void PoolingTransposeForwardGPU(at::Tensor in_feat, at::Tensor out_feat, NonzeroAvgPoolingForwardKernelGPU( in_feat.template data(), in_feat.size(0), out_feat.template data(), out_nrows, - num_nonzero.template data(), in_feat.size(1), get<0>(in_out), - get<1>(in_out), false, handle, at::cuda::getCurrentCUDAStream()); + num_nonzero.template data(), in_feat.size(1), + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + false, handle, at::cuda::getCurrentCUDAStream()); } template @@ -151,8 +153,8 @@ void PoolingTransposeBackwardGPU( vector kernel_sizes, vector dilations, int region_type, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); bool reverse_map = false; const InOutMapKey rev_map_key = p_coords_manager->getMapHashKey( tensor_strides, strides, kernel_sizes, dilations, region_type, @@ -171,29 +173,29 @@ void PoolingTransposeBackwardGPU( if (!reverse_map) { ASSERT( - p_coords_manager->d_in_maps.find(map_key) != - p_coords_manager->d_in_maps.end(), + p_coords_manager->in_maps.find(map_key) != + p_coords_manager->in_maps.end(), "The in-out map doesn't exist for backward. Did you run forward pass?"); NonzeroAvgPoolingBackwardKernelGPU( grad_in_feat.template data(), in_feat.size(0), grad_out_feat.template data(), grad_out_feat.size(0), num_nonzero.template data(), in_feat.size(1), - p_coords_manager->d_in_maps[map_key], - p_coords_manager->d_out_maps[map_key], false, + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], false, at::cuda::getCurrentCUDAStream()); } else { ASSERT( - p_coords_manager->d_in_maps.find(rev_map_key) != - p_coords_manager->d_in_maps.end(), + p_coords_manager->in_maps.find(rev_map_key) != + p_coords_manager->in_maps.end(), "The in-out map doesn't exist for backward. 
Did you run forward pass?"); NonzeroAvgPoolingBackwardKernelGPU( grad_in_feat.template data(), in_feat.size(0), grad_out_feat.template data(), grad_out_feat.size(0), num_nonzero.template data(), in_feat.size(1), - p_coords_manager->d_out_maps[rev_map_key], - p_coords_manager->d_in_maps[rev_map_key], false, + p_coords_manager->out_maps[rev_map_key], + p_coords_manager->in_maps[rev_map_key], false, at::cuda::getCurrentCUDAStream()); } } @@ -229,28 +231,28 @@ template void PoolingTransposeBackwardCPU( #ifndef CPU_ONLY -template void PoolingTransposeForwardGPU( +template void PoolingTransposeForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void PoolingTransposeForwardGPU( +template void PoolingTransposeForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void PoolingTransposeBackwardGPU( +template void PoolingTransposeBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void PoolingTransposeBackwardGPU( +template void PoolingTransposeBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, diff --git a/src/pruning.cpp b/src/pruning.cpp index 5ca31e8a..98f7eeac 100644 --- a/src/pruning.cpp +++ b/src/pruning.cpp @@ -98,9 +98,9 @@ void PruningForwardGPU(at::Tensor in_feat, // GPU feat py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); - const auto &in_out = p_coords_manager->getPruningInOutMapsGPU( + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); + const InOutMapKey map_key = p_coords_manager->getPruningInOutMaps( use_feat, py_in_coords_key, py_out_coords_key); // Get the total number of coords @@ -115,7 +115,9 @@ void PruningForwardGPU(at::Tensor in_feat, // GPU feat PruningForwardKernelGPU( in_feat.template data(), out_feat.template data(), - in_feat.size(1), get<0>(in_out), get<1>(in_out), + in_feat.size(1), + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], at::cuda::getCurrentCUDAStream()); } } @@ -126,14 +128,14 @@ void PruningBackwardGPU(at::Tensor grad_in_feat, // GPU feat py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); const InOutMapKey map_key = p_coords_manager->getOriginMapHashKey( py_in_coords_key, py_out_coords_key); - ASSERT(p_coords_manager->d_in_maps.find(map_key) != - p_coords_manager->d_in_maps.end(), + ASSERT(p_coords_manager->in_maps.find(map_key) != + p_coords_manager->in_maps.end(), "The in-out map doesn't exist for backward. 
Did you run forward pass?") const int in_nrows = p_coords_manager->getCoordsSize(py_in_coords_key); @@ -145,8 +147,8 @@ void PruningBackwardGPU(at::Tensor grad_in_feat, // GPU feat PruningBackwardKernelGPU(grad_in_feat.template data(), grad_out_feat.template data(), nchannel, - p_coords_manager->d_in_maps[map_key], - p_coords_manager->d_out_maps[map_key], + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], at::cuda::getCurrentCUDAStream()); else WARNING(true, "MinkowskiPruning: Backprop from a size-0 sparse tensor."); @@ -174,22 +176,22 @@ template void PruningBackwardCPU( py::object py_coords_manager); #ifndef CPU_ONLY -template void PruningForwardGPU( +template void PruningForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor use_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void PruningForwardGPU( +template void PruningForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor use_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void PruningBackwardGPU( +template void PruningBackwardGPU( at::Tensor grad_in_feat, at::Tensor grad_out_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void PruningBackwardGPU( +template void PruningBackwardGPU( at::Tensor grad_in_feat, at::Tensor grad_out_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); diff --git a/src/pruning.cu b/src/pruning.cu index baff628e..6ec0aeb6 100644 --- a/src/pruning.cu +++ b/src/pruning.cu @@ -46,49 +46,47 @@ __global__ void copy_in_out_map(const int n, const Dtype *in_feat, template void PruningForwardKernelGPU(const Dtype *d_in_feat, Dtype *d_out_feat, const int nchannel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream) { - const int nnz = in_maps[0].size(); + const int nnz = in_maps[0].size(0); copy_in_out_map <<>>( - nnz, d_in_feat, d_out_feat, nchannel, in_maps[0].data(), - out_maps[0].data()); + nnz, d_in_feat, d_out_feat, nchannel, in_maps[0].data(), + out_maps[0].data()); } template void PruningBackwardKernelGPU(Dtype *d_grad_in_feat, const Dtype *d_grad_out_feat, int nchannel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream) { - const int nnz = in_maps[0].size(); + const int nnz = in_maps[0].size(0); copy_in_out_map <<>>( - nnz, d_grad_out_feat, d_grad_in_feat, nchannel, out_maps[0].data(), - in_maps[0].data()); + nnz, d_grad_out_feat, d_grad_in_feat, nchannel, out_maps[0].data(), + in_maps[0].data()); } template void PruningForwardKernelGPU( const float *d_in_feat, float *d_out_feat, int nchannel, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); template void PruningBackwardKernelGPU( float *d_grad_in_feat, const float *d_grad_out_feat, int nchannel, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); template void PruningForwardKernelGPU( const double *d_in_feat, double *d_out_feat, int nchannel, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); template void PruningBackwardKernelGPU( double *d_grad_in_feat, const double *d_grad_out_feat, int nchannel, - const pInOutMaps 
&in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); } // end namespace minkowski diff --git a/src/pruning.cuh b/src/pruning.cuh index 9ac6ed32..3399c288 100644 --- a/src/pruning.cuh +++ b/src/pruning.cuh @@ -27,6 +27,7 @@ #include #include +#include #include "types.hpp" @@ -34,15 +35,14 @@ namespace minkowski { template void PruningForwardKernelGPU(const Dtype *d_in_feat, Dtype *d_out_feat, - int nchannel, const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + int nchannel, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); template void PruningBackwardKernelGPU(Dtype *d_grad_in_feat, const Dtype *d_grad_out_feat, int nchannel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); } // end namespace minkowski diff --git a/src/quantization.cpp b/src/quantization.cpp index a1060bf7..65cae94a 100644 --- a/src/quantization.cpp +++ b/src/quantization.cpp @@ -230,21 +230,6 @@ template InOutMaps CopyToInOutMap(at::Tensor th_map) { return vec_map; } -#ifndef CPU_ONLY -template -pInOutMaps CopyToInOutMapGPU(at::Tensor th_map) { - pInOutMaps vec_map; - - Dtype *d_scr; - CUDA_CHECK(cudaMalloc(&d_scr, th_map.size(0) * sizeof(Dtype))); - CUDA_CHECK(cudaMemcpy(d_scr, th_map.template data(), - th_map.size(0) * sizeof(Dtype), - cudaMemcpyHostToDevice)); - vec_map.push_back(pVector(d_scr, th_map.size(0))); - return vec_map; -} -#endif - /** * A collection of feature averaging methods * mode == 0: non-weighted average @@ -278,22 +263,19 @@ at::Tensor quantization_average_features( if (th_in_map.dtype() == torch::kInt64) { if (th_in_feat.is_cuda()) { #ifndef CPU_ONLY - auto vec_in_map = CopyToInOutMapGPU(th_in_map); - auto vec_out_map = CopyToInOutMapGPU(th_out_map); - if (th_in_feat.dtype() == torch::kFloat32) { NonzeroAvgPoolingForwardKernelGPU( th_in_feat.template data(), th_in_feat.size(0), th_out_feat.template data(), out_nrows, th_num_nonzero.template data(), th_in_feat.size(1), - vec_in_map, vec_out_map, true, handle, + {th_in_map}, {th_out_map}, true, handle, at::cuda::getCurrentCUDAStream()); } else if (th_in_feat.dtype() == torch::kFloat64) { NonzeroAvgPoolingForwardKernelGPU( th_in_feat.template data(), th_in_feat.size(0), th_out_feat.template data(), out_nrows, th_num_nonzero.template data(), th_in_feat.size(1), - vec_in_map, vec_out_map, true, handle, + {th_in_map}, {th_out_map}, true, handle, at::cuda::getCurrentCUDAStream()); } else { throw std::runtime_error("Dtype not supported."); @@ -324,22 +306,19 @@ at::Tensor quantization_average_features( } else if (th_in_map.dtype() == torch::kInt32) { if (th_in_feat.is_cuda()) { #ifndef CPU_ONLY - auto vec_in_map = CopyToInOutMapGPU(th_in_map); - auto vec_out_map = CopyToInOutMapGPU(th_out_map); - if (th_in_feat.dtype() == torch::kFloat32) { NonzeroAvgPoolingForwardKernelGPU( th_in_feat.template data(), th_in_feat.size(0), th_out_feat.template data(), out_nrows, th_num_nonzero.template data(), th_in_feat.size(1), - vec_in_map, vec_out_map, true, handle, + {th_in_map}, {th_out_map}, true, handle, at::cuda::getCurrentCUDAStream()); } else if (th_in_feat.dtype() == torch::kFloat64) { NonzeroAvgPoolingForwardKernelGPU( th_in_feat.template data(), th_in_feat.size(0), th_out_feat.template data(), out_nrows, th_num_nonzero.template data(), th_in_feat.size(1), - vec_in_map, vec_out_map, true, handle, + {th_in_map}, {th_out_map}, true, handle, at::cuda::getCurrentCUDAStream()); } else { throw 
std::runtime_error("Dtype not supported."); diff --git a/src/region.hpp b/src/region.hpp index 921f92ce..27912055 100644 --- a/src/region.hpp +++ b/src/region.hpp @@ -47,6 +47,7 @@ class RegionIterator { vector operator*() { return point; }; }; +// TODO(ljm): remove stride, stride will not affect region class Region { public: Region(const Region ®ion_); diff --git a/src/union.cpp b/src/union.cpp index 5dd933f2..d4336df9 100644 --- a/src/union.cpp +++ b/src/union.cpp @@ -119,8 +119,8 @@ at::Tensor UnionForwardGPU(vector in_feats, vector py_in_coords_keys, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); // Basic assertions ASSERT(in_feats.size() > 1, "The number of input tensors must be > 1."); const size_t n_in = in_feats.size(); @@ -139,7 +139,7 @@ at::Tensor UnionForwardGPU(vector in_feats, } // Create new out map and get the in-out map - const auto in_out = p_coords_manager->getUnionInOutMapsGPU(py_in_coords_keys, + const InOutMapKey map_key = p_coords_manager->getUnionInOutMaps(py_in_coords_keys, py_out_coords_key); // Out feat memory alloc @@ -155,7 +155,9 @@ at::Tensor UnionForwardGPU(vector in_feats, UnionForwardKernelGPU( p_in_feats, out_feat.template data(), in_feats[0].size(1), - in_out.first, in_out.second, at::cuda::getCurrentCUDAStream()); + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + at::cuda::getCurrentCUDAStream()); return out_feat; } @@ -164,8 +166,8 @@ template vector UnionBackwardGPU(at::Tensor grad_out_feat, vector py_in_coords_keys, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); const int nchannel = grad_out_feat.size(1); const size_t n_in = py_in_coords_keys.size(); @@ -192,8 +194,8 @@ UnionBackwardGPU(at::Tensor grad_out_feat, vector py_in_coords_keys, UnionBackwardKernelGPU( p_grad_in_feats, grad_out_feat.template data(), nchannel, - p_coords_manager->d_in_maps[map_key], - p_coords_manager->d_out_maps[map_key], at::cuda::getCurrentCUDAStream()); + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], at::cuda::getCurrentCUDAStream()); return grad_in_feats; } @@ -216,19 +218,19 @@ template vector UnionBackwardCPU( py::object py_out_coords_key, py::object py_coords_manager); #ifndef CPU_ONLY -template at::Tensor UnionForwardGPU( +template at::Tensor UnionForwardGPU( vector in_feats, vector py_in_coords_keys, py::object py_out_coords_key, py::object py_coords_manager); -template at::Tensor UnionForwardGPU( +template at::Tensor UnionForwardGPU( vector in_feats, vector py_in_coords_keys, py::object py_out_coords_key, py::object py_coords_manager); -template vector UnionBackwardGPU( +template vector UnionBackwardGPU( at::Tensor grad_out_feat, vector py_in_coords_keys, py::object py_out_coords_key, py::object py_coords_manager); -template vector UnionBackwardGPU( +template vector UnionBackwardGPU( at::Tensor grad_out_feat, vector py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); diff --git a/src/union.cu b/src/union.cu index c23f672a..ebd3058a 100644 --- a/src/union.cu +++ b/src/union.cu @@ -23,7 +23,7 @@ * of the code. 
*/ #include "gpu.cuh" -#include "pruning.cuh" +#include "union.cuh" namespace minkowski { @@ -62,53 +62,53 @@ __global__ void copy_in_out_map(const int n, const Dtype *in_feat, template void UnionForwardKernelGPU(const vector d_in_feats, Dtype *d_out_feat, - const int nchannel, const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const int nchannel, + const vector& in_maps, const vector& out_maps, cudaStream_t stream) { for (size_t k = 0; k < in_maps.size(); k++) { - const size_t nnz = in_maps[k].size(); + const size_t nnz = in_maps[k].size(0); add_in_out_map <<>>( - nnz, d_in_feats[k], d_out_feat, nchannel, in_maps[k].data(), - out_maps[k].data()); + nnz, d_in_feats[k], d_out_feat, nchannel, in_maps[k].data(), + out_maps[k].data()); } } template void UnionBackwardKernelGPU(vector d_grad_in_feats, const Dtype *d_grad_out_feat, int nchannel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream) { for (size_t k = 0; k < in_maps.size(); k++) { - const int nnz = in_maps[k].size(); + const int nnz = in_maps[k].size(0); copy_in_out_map <<>>( nnz, d_grad_out_feat, d_grad_in_feats[k], nchannel, - out_maps[k].data(), in_maps[k].data()); + out_maps[k].data(), in_maps[k].data()); } } template void UnionForwardKernelGPU( const vector d_in_feats, float *d_out_feat, int nchannel, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); template void UnionBackwardKernelGPU( vector d_grad_in_feats, const float *d_grad_out_feat, int nchannel, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); template void UnionForwardKernelGPU( const vector d_in_feats, double *d_out_feat, int nchannel, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); template void UnionBackwardKernelGPU( vector d_grad_in_feats, const double *d_grad_out_feat, - int nchannel, const pInOutMaps &in_maps, - const pInOutMaps &out_maps, cudaStream_t stream); + int nchannel, + const vector& in_maps, const vector& out_maps, + cudaStream_t stream); } // end namespace minkowski diff --git a/src/union.cuh b/src/union.cuh index af4719f8..ceaeb98e 100644 --- a/src/union.cuh +++ b/src/union.cuh @@ -27,6 +27,7 @@ #include #include +#include #include "types.hpp" @@ -34,15 +35,14 @@ namespace minkowski { template void UnionForwardKernelGPU(const vector d_in_feats, Dtype *d_out_feat, - int nchannel, const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + int nchannel, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); template void UnionBackwardKernelGPU(vector d_grad_in_feats, const Dtype *d_grad_out_feat, int nchannel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); } // namespace minkowski
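
With the pooling, pruning and union kernels now taking const std::vector<at::Tensor>& for both maps, call sites can build those vectors in place; this is what the {th_in_map}, {th_out_map} arguments in the quantization.cpp hunk rely on, and why the CopyToInOutMapGPU helper could be deleted. Below is a minimal, self-contained sketch of that calling convention; total_map_entries is an illustrative stand-in with the same parameter style as the rewritten kernels, not a function from the patch.

#include <torch/torch.h>
#include <vector>

// Stand-in mirroring the rewritten GPU kernel parameter style:
// parallel vectors of contiguous int32 CUDA tensors.
static int64_t total_map_entries(const std::vector<at::Tensor> &in_maps,
                                 const std::vector<at::Tensor> &out_maps) {
  TORCH_CHECK(in_maps.size() == out_maps.size(),
              "in/out maps must have the same number of segments");
  int64_t nmaps = 0;
  for (const auto &m : in_maps)
    nmaps += m.size(0);  // same accumulation the kernels perform
  return nmaps;
}

int main() {
  auto opts = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA);
  at::Tensor th_in_map = torch::arange(0, 8, opts);
  at::Tensor th_out_map = torch::zeros({8}, opts);

  // Brace-initialization constructs the std::vector<at::Tensor> arguments
  // in place, mirroring the {th_in_map}, {th_out_map} call sites above.
  return total_map_entries({th_in_map}, {th_out_map}) == 8 ? 0 : 1;
}
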