diff --git a/Makefile b/Makefile index e14c51ea..62d480a1 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ Q ?= @ # CPU_ONLY := 1 CXX ?= g++ -PYTHON ?= python +PYTHON ?= python3.6 EXTENSION_NAME := minkowski @@ -66,17 +66,27 @@ ifneq ($(CPU_ONLY), 1) endif SRC_DIR := ./src +SRC_GPU_COORDS_MAP_DIR := ./src/3rdparty/gpu_coords_map/include +SRC_SLAB_HASH_DIR := ./src/3rdparty/gpu_coords_map/include/slab_hash OBJ_DIR := ./objs CPP_SRCS := $(wildcard $(SRC_DIR)/*.cpp) +CPP_SRCS_GPU_COORDS_MAP := $(wildcard $(SRC_GPU_COORDS_MAP_DIR)/*.cpp) +CPP_SRCS_SLAB_HASH:= $(wildcard $(SRC_SLAB_HASH_DIR)/*.cpp) CU_SRCS := $(wildcard $(SRC_DIR)/*.cu) +CU_SRCS_GPU_COORDS_MAP := $(wildcard $(SRC_GPU_COORDS_MAP_DIR)/*.cu) +CU_SRCS_SLAB_HASH:= $(wildcard $(SRC_SLAB_HASH_DIR)/*.cu) OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(CPP_SRCS)) +OBJS_GPU_COORDS_MAP := $(patsubst $(SRC_GPU_COORDS_MAP_DIR)/%.cpp,$(OBJ_DIR)/3rdparty/gpu_coords_map/include/%.o,$(CPP_SRCS_GPU_COORDS_MAP)) +OBJS_SLAB_HASH := $(patsubst $(SRC_SLAB_HASH_DIR)/%.cpp,$(OBJ_DIR)/3rdparty/gpu_coords_map/include/slab_hash/%.o,$(CPP_SRCS_SLAB_HASH)) CU_OBJS := $(patsubst $(SRC_DIR)/%.cu,$(OBJ_DIR)/cuda/%.o,$(CU_SRCS)) +CU_OBJS_GPU_COORDS_MAP := $(patsubst $(SRC_GPU_COORDS_MAP_DIR)/%.cu,$(OBJ_DIR)/3rdparty/gpu_coords_map/include/cuda/%.o,$(CU_SRCS_GPU_COORDS_MAP)) +CU_OBJS_SLAB_HASH := $(patsubst $(SRC_SLAB_HASH_DIR)/%.cu,$(OBJ_DIR)/3rdparty/gpu_coords_map/include/slab_hash/cuda/%.o,$(CU_SRCS_SLAB_HASH)) STATIC_LIB := $(OBJ_DIR)/lib$(EXTENSION_NAME).a # We will also explicitly add stdc++ to the link target. LIBRARIES := stdc++ c10 caffe2 torch torch_python _C ifneq ($(CPU_ONLY), 1) - LIBRARIES += cudart cublas cusparse caffe2_gpu c10_cuda + LIBRARIES += cudadevrt cudart cudadevrt cublas cudadevrt cusparse cudadevrt caffe2_gpu cudadevrt c10_cuda cudadevrt CUDA_ARCH := -gencode arch=compute_30,code=sm_30 \ -gencode arch=compute_35,code=sm_35 \ -gencode=arch=compute_50,code=sm_50 \ @@ -118,6 +128,7 @@ ifeq ($(DEBUG), 1) COMMON_FLAGS += -DDEBUG -g -O0 # https://gcoe-dresden.de/reaching-the-shore-with-a-fog-warning-my-eurohack-day-4-morning-session/ NVCCFLAGS := -g -G # -rdc true + # NVCCFLAGS := -g -G -rdc true else COMMON_FLAGS += -DNDEBUG -O3 endif @@ -140,6 +151,7 @@ COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) \ CXXFLAGS += -fopenmp -fPIC -fwrapv -std=c++14 $(COMMON_FLAGS) $(WARNINGS) NVCCFLAGS += -std=c++14 -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS) +NVCCFLAGS += -rdc true LINKFLAGS += -pthread -fPIC $(WARNINGS) -Wl,-rpath=$(PYTHON_LIB_DIR) -Wl,--no-as-needed -Wl,--sysroot=/ LDFLAGS += $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) \ $(foreach library,$(LIBRARIES),-l$(library)) @@ -148,7 +160,7 @@ ifeq ($(CPU_ONLY), 1) ALL_OBJS := $(OBJS) CXXFLAGS += -DCPU_ONLY else - ALL_OBJS := $(OBJS) $(CU_OBJS) + ALL_OBJS := $(OBJS) $(OBJS_GPU_COORDS_MAP) $(OBJS_SLAB_HASH) $(CU_OBJS) $(CU_OBJS_GPU_COORDS_MAP) $(CU_OBJS_SLAB_HASH) endif all: $(STATIC_LIB) @@ -157,8 +169,19 @@ all: $(STATIC_LIB) $(OBJ_DIR): @ mkdir -p $@ @ mkdir -p $@/cuda + @ mkdir -p $@/3rdparty/gpu_coords_map/include/cuda + @ mkdir -p $@/3rdparty/gpu_coords_map/include/slab_hash/cuda $(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp | $(OBJ_DIR) + @ echo CXX $< + @ echo $(CXXFLAGS) + $(Q)$(CXX) $< $(CXXFLAGS) -c -o $@ + +$(OBJ_DIR)/3rdparty/gpu_coords_map/include/%.o: $(SRC_GPU_COORDS_MAP_DIR)/%.cpp | $(OBJ_DIR) + @ echo CXX $< + $(Q)$(CXX) $< $(CXXFLAGS) -c -o $@ + +$(OBJ_DIR)/3rdparty/gpu_coords_map/include/slab_hash/%.o: $(SRC_SLAB_HASH_DIR)/%.cpp | $(OBJ_DIR) @ 
echo CXX $< $(Q)$(CXX) $< $(CXXFLAGS) -c -o $@ @@ -168,8 +191,23 @@ $(OBJ_DIR)/cuda/%.o: $(SRC_DIR)/%.cu | $(OBJ_DIR) -odir $(@D) $(Q)$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ +$(OBJ_DIR)/3rdparty/gpu_coords_map/include/cuda/%.o: $(SRC_GPU_COORDS_MAP_DIR)/%.cu | $(OBJ_DIR) + @ echo NVCC $< + $(Q)$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \ + -odir $(@D) + $(Q)$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ + +$(OBJ_DIR)/3rdparty/gpu_coords_map/include/slab_hash/cuda/%.o: $(SRC_SLAB_HASH_DIR)/%.cu| $(OBJ_DIR) + @ echo NVCC $< + $(Q)$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \ + -odir $(@D) + $(Q)$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ + $(STATIC_LIB): $(ALL_OBJS) | $(OBJ_DIR) $(RM) -f $(STATIC_LIB) + @ echo $(LINKFLAGS) + @ echo $(LDFLAGS) + @ echo $(CXXFLAGS) @ echo LD -o $@ ar rc $(STATIC_LIB) $(ALL_OBJS) diff --git a/MinkowskiEngine/MinkowskiCoords.py b/MinkowskiEngine/MinkowskiCoords.py index 38ff118c..4a258f9f 100644 --- a/MinkowskiEngine/MinkowskiCoords.py +++ b/MinkowskiEngine/MinkowskiCoords.py @@ -35,7 +35,8 @@ if 'OMP_NUM_THREADS' in os.environ: CPU_COUNT = int(os.environ['OMP_NUM_THREADS']) -_memory_manager_backend = MemoryManagerBackend.PYTORCH +#_memory_manager_backend = MemoryManagerBackend.PYTORCH +_memory_manager_backend = MemoryManagerBackend.CUDA def set_memory_manager_backend(backend: MemoryManagerBackend): @@ -102,7 +103,8 @@ class CoordsManager(): def __init__(self, num_threads: int = -1, memory_manager_backend: MemoryManagerBackend = None, - D: int = -1): + D: int = -1, + device: str = 'cuda'): if D < 1: raise ValueError(f"Invalid dimension {D}") self.D = D @@ -111,7 +113,9 @@ def __init__(self, if memory_manager_backend is None: global _memory_manager_backend memory_manager_backend = _memory_manager_backend - coords_man = MEB.CoordsManager(num_threads, memory_manager_backend) + coords_man = MEB.CoordsManager(num_threads, memory_manager_backend) \ + if device == 'cpu' else \ + MEB.GPUCoordsManager(D, 0, memory_manager_backend) self.CPPCoordsManager = coords_man def initialize(self, @@ -120,14 +124,15 @@ def initialize(self, force_creation: bool = False, force_remap: bool = False, allow_duplicate_coords: bool = False, - return_inverse: bool = False) -> torch.LongTensor: + return_inverse: bool = False) -> torch.IntTensor: assert isinstance(coords_key, CoordsKey) - unique_index = torch.LongTensor() - inverse_mapping = torch.LongTensor() + # TODO(ljm): Adjust cpu interface from long to int acoordingly + unique_index = torch.IntTensor() + inverse_mapping = torch.IntTensor() self.CPPCoordsManager.initializeCoords( coords, unique_index, inverse_mapping, coords_key.CPPCoordsKey, force_creation, force_remap, allow_duplicate_coords, return_inverse) - return unique_index, inverse_mapping + return unique_index.long(), inverse_mapping.long() def create_coords_key(self, coords: torch.IntTensor, @@ -171,6 +176,9 @@ def stride(self, def reduce(self): origin_key = CoordsKey(self.D) origin_key.setTensorStride(convert_to_int_list(0, self.D)) + # TODO(ljm): Get batch_size by createOriginCoords + # TODO(ljm): Find a better way to get batch_size + # Notice(ljm): It can be concluded that the batch indices are contigous by GetCoordsAt origin_key.setKey(self.CPPCoordsManager.createOriginCoords(self.D)) return origin_key @@ -322,6 +330,9 @@ def get_kernel_map(self, is_transpose, is_pool) + kernel_map[0] = kernel_map[0].long() + kernel_map[1] = kernel_map[1].long() + return kernel_map def get_coords_map(self, in_key_or_tensor_strides, diff --git 
a/MinkowskiEngine/SparseTensor.py b/MinkowskiEngine/SparseTensor.py index a5e22fd7..6d285779 100644 --- a/MinkowskiEngine/SparseTensor.py +++ b/MinkowskiEngine/SparseTensor.py @@ -229,6 +229,7 @@ def __init__( of the current sparse tensor. By default, it is 1. """ + print(coords) assert isinstance(feats, torch.Tensor), "Features must be a torch.Tensor" assert feats.ndim == 2, f"The feature should be a matrix, The input feature is an order-{feats.ndim} tensor." @@ -254,12 +255,14 @@ def __init__( assert isinstance(coords, torch.Tensor), \ "Coordinate must be of type torch.Tensor" + print(isinstance(coords, torch.IntTensor)) if not isinstance(coords, torch.IntTensor): warnings.warn( 'Coords implicitly converted to torch.IntTensor. ' + 'To remove this warning, use `.int()` to convert the ' + 'coords into an torch.IntTensor') - coords = torch.floor(coords).int() + print(isinstance(coords, torch.IntTensor)) +# coords = torch.floor(coords).int() if coords.device.type != 'cpu': warnings.warn( @@ -283,7 +286,9 @@ def __init__( if _global_coords_man is None: _global_coords_man = CoordsManager( memory_manager_backend=memory_manager_backend, - D=coords.size(1) - 1) + D=coords.size(1) - 1, + device=coords.device.type if coords is not None else 'cuda') + # TODO(ljm): handle device when coords is None coords_manager = _global_coords_man else: assert coords is not None, "Initial coordinates must be given" diff --git a/pybind/minkowski.cpp b/pybind/minkowski.cpp index 9c2ec7b1..7442870d 100644 --- a/pybind/minkowski.cpp +++ b/pybind/minkowski.cpp @@ -31,11 +31,13 @@ #include "extern.hpp" #include "src/common.hpp" +#include "src/types.hpp" namespace py = pybind11; namespace mink = minkowski; +/* template void instantiate_func(py::module &m, const std::string &dtypestr) { m.def((std::string("ConvolutionForwardCPU") + dtypestr).c_str(), @@ -188,6 +190,153 @@ void instantiate_func(py::module &m, const std::string &dtypestr) { py::call_guard()); #endif } +*/ + +template +void instantiate_func(py::module &m, const std::string &dtypestr) { + m.def((std::string("ConvolutionForwardCPU") + dtypestr).c_str(), + &mink::ConvolutionForwardCPU, + py::call_guard()); + m.def((std::string("ConvolutionBackwardCPU") + dtypestr).c_str(), + &mink::ConvolutionBackwardCPU, + py::call_guard()); + + m.def((std::string("ConvolutionTransposeForwardCPU") + dtypestr).c_str(), + &mink::ConvolutionTransposeForwardCPU, + py::call_guard()); + m.def((std::string("ConvolutionTransposeBackwardCPU") + dtypestr).c_str(), + &mink::ConvolutionTransposeBackwardCPU, + py::call_guard()); + + m.def((std::string("AvgPoolingForwardCPU") + dtypestr).c_str(), + &mink::AvgPoolingForwardCPU, + py::call_guard()); + m.def((std::string("AvgPoolingBackwardCPU") + dtypestr).c_str(), + &mink::AvgPoolingBackwardCPU, + py::call_guard()); + + m.def((std::string("MaxPoolingForwardCPU") + dtypestr).c_str(), + &mink::MaxPoolingForwardCPU, + py::call_guard()); + m.def((std::string("MaxPoolingBackwardCPU") + dtypestr).c_str(), + &mink::MaxPoolingBackwardCPU, + py::call_guard()); + + m.def((std::string("PoolingTransposeForwardCPU") + dtypestr).c_str(), + &mink::PoolingTransposeForwardCPU, + py::call_guard()); + m.def((std::string("PoolingTransposeBackwardCPU") + dtypestr).c_str(), + &mink::PoolingTransposeBackwardCPU, + py::call_guard()); + + m.def((std::string("GlobalPoolingForwardCPU") + dtypestr).c_str(), + &mink::GlobalPoolingForwardCPU, + py::call_guard()); + m.def((std::string("GlobalPoolingBackwardCPU") + dtypestr).c_str(), + &mink::GlobalPoolingBackwardCPU, + 
py::call_guard()); + + m.def((std::string("GlobalMaxPoolingForwardCPU") + dtypestr).c_str(), + &mink::GlobalMaxPoolingForwardCPU, + py::call_guard()); + m.def((std::string("GlobalMaxPoolingBackwardCPU") + dtypestr).c_str(), + &mink::GlobalMaxPoolingBackwardCPU, + py::call_guard()); + + m.def((std::string("BroadcastForwardCPU") + dtypestr).c_str(), + &mink::BroadcastForwardCPU, + py::call_guard()); + m.def((std::string("BroadcastBackwardCPU") + dtypestr).c_str(), + &mink::BroadcastBackwardCPU, + py::call_guard()); + + m.def((std::string("PruningForwardCPU") + dtypestr).c_str(), + &mink::PruningForwardCPU, + py::call_guard()); + m.def((std::string("PruningBackwardCPU") + dtypestr).c_str(), + &mink::PruningBackwardCPU, + py::call_guard()); + + m.def((std::string("UnionForwardCPU") + dtypestr).c_str(), + &mink::UnionForwardCPU, + py::call_guard()); + m.def((std::string("UnionBackwardCPU") + dtypestr).c_str(), + &mink::UnionBackwardCPU, + py::call_guard()); +} + +template +void instantiate_func_gpu(py::module &m, const std::string &dtypestr) { + m.def((std::string("ConvolutionForwardGPU") + dtypestr).c_str(), + &mink::ConvolutionForwardGPU, + py::call_guard()); + m.def((std::string("ConvolutionBackwardGPU") + dtypestr).c_str(), + &mink::ConvolutionBackwardGPU, + py::call_guard()); + + m.def((std::string("ConvolutionTransposeForwardGPU") + dtypestr).c_str(), + &mink::ConvolutionTransposeForwardGPU, + py::call_guard()); + m.def((std::string("ConvolutionTransposeBackwardGPU") + dtypestr).c_str(), + &mink::ConvolutionTransposeBackwardGPU, + py::call_guard()); + + m.def((std::string("AvgPoolingForwardGPU") + dtypestr).c_str(), + &mink::AvgPoolingForwardGPU, + py::call_guard()); + m.def((std::string("AvgPoolingBackwardGPU") + dtypestr).c_str(), + &mink::AvgPoolingBackwardGPU, + py::call_guard()); + + m.def((std::string("MaxPoolingForwardGPU") + dtypestr).c_str(), + &mink::MaxPoolingForwardGPU, + py::call_guard()); + m.def((std::string("MaxPoolingBackwardGPU") + dtypestr).c_str(), + &mink::MaxPoolingBackwardGPU, + py::call_guard()); + + m.def((std::string("PoolingTransposeForwardGPU") + dtypestr).c_str(), + &mink::PoolingTransposeForwardGPU, + py::call_guard()); + m.def((std::string("PoolingTransposeBackwardGPU") + dtypestr).c_str(), + &mink::PoolingTransposeBackwardGPU, + py::call_guard()); + + m.def((std::string("GlobalPoolingForwardGPU") + dtypestr).c_str(), + &mink::GlobalPoolingForwardGPU, + py::call_guard()); + m.def((std::string("GlobalPoolingBackwardGPU") + dtypestr).c_str(), + &mink::GlobalPoolingBackwardGPU, + py::call_guard()); + + m.def((std::string("GlobalMaxPoolingForwardGPU") + dtypestr).c_str(), + &mink::GlobalMaxPoolingForwardGPU, + py::call_guard()); + m.def((std::string("GlobalMaxPoolingBackwardGPU") + dtypestr).c_str(), + &mink::GlobalMaxPoolingBackwardGPU, + py::call_guard()); + + m.def((std::string("BroadcastForwardGPU") + dtypestr).c_str(), + &mink::BroadcastForwardGPU, + py::call_guard()); + m.def((std::string("BroadcastBackwardGPU") + dtypestr).c_str(), + &mink::BroadcastBackwardGPU, + py::call_guard()); + + m.def((std::string("PruningForwardGPU") + dtypestr).c_str(), + &mink::PruningForwardGPU, + py::call_guard()); + m.def((std::string("PruningBackwardGPU") + dtypestr).c_str(), + &mink::PruningBackwardGPU, + py::call_guard()); + + m.def((std::string("UnionForwardGPU") + dtypestr).c_str(), + &mink::UnionForwardGPU, + py::call_guard()); + m.def((std::string("UnionBackwardGPU") + dtypestr).c_str(), + &mink::UnionBackwardGPU, + py::call_guard()); +} template void 
instantiate_coordsman(py::module &m) { std::string coords_name = std::string("CoordsManager"); @@ -199,9 +348,6 @@ template void instantiate_coordsman(py::module &m) { mink::CoordsManager::existsCoordsKey) .def("getCoordsKey", &mink::CoordsManager::getCoordsKey) .def("getKernelMap", &mink::CoordsManager::getKernelMap) -#ifndef CPU_ONLY - .def("getKernelMapGPU", &mink::CoordsManager::getKernelMapGPU) -#endif .def("getCoordsMap", &mink::CoordsManager::getCoordsMap) .def("getUnionMap", &mink::CoordsManager::getUnionMap) .def("getCoordsSize", @@ -235,12 +381,61 @@ template void instantiate_coordsman(py::module &m) { [](const mink::CoordsManager &a) { return a.toString(); }); } +template void instantiate_coordsman_gpu(py::module &m) { + std::string coords_name = std::string("GPUCoordsManager"); + py::class_>(m, coords_name.c_str()) +// .def(py::init()) + .def(py::init()) + .def("existsCoordsKey", + (bool (mink::GPUCoordsManager::*)(py::object) const) & + mink::GPUCoordsManager::existsCoordsKey) + .def("getCoordsKey", &mink::GPUCoordsManager::getCoordsKey) + .def("getKernelMap", &mink::GPUCoordsManager::getKernelMap) + .def("getCoordsMap", &mink::GPUCoordsManager::getCoordsMap) + .def("getUnionMap", &mink::GPUCoordsManager::getUnionMap) + .def("getCoordsSize", + (int (mink::GPUCoordsManager::*)(py::object) const) & + mink::GPUCoordsManager::getCoordsSize) + .def("getCoords", &mink::GPUCoordsManager::getCoords) + .def("getBatchSize", &mink::GPUCoordsManager::getBatchSize) + .def("getBatchIndices", &mink::GPUCoordsManager::getBatchIndices) + .def("getRowIndicesAtBatchIndex", + &mink::GPUCoordsManager::getRowIndicesAtBatchIndex) + .def("getRowIndicesPerBatch", + &mink::GPUCoordsManager::getRowIndicesPerBatch) + .def("setOriginCoordsKey", + &mink::GPUCoordsManager::setOriginCoordsKey) + .def("initializeCoords", + (uint64_t(mink::GPUCoordsManager::*)( + at::Tensor, at::Tensor, at::Tensor, py::object, const bool, + const bool, const bool, const bool)) & + mink::GPUCoordsManager::initializeCoords, + py::call_guard()) + .def("createStridedCoords", + &mink::GPUCoordsManager::createStridedCoords) + .def("createTransposedStridedRegionCoords", + &mink::GPUCoordsManager::createTransposedStridedRegionCoords) + .def("createPrunedCoords", + &mink::GPUCoordsManager::createPruningCoords) + .def("createOriginCoords", + &mink::GPUCoordsManager::createOriginCoords) +// .def("printDiagnostics", &mink::GPUCoordsManager::printDiagnostics) + .def("__repr__", + [](const mink::GPUCoordsManager &a) { return a.toString(); }); +} + template void instantiate(py::module &m) { instantiate_coordsman(m); instantiate_func(m, std::string("f")); instantiate_func(m, std::string("d")); } +template void instantiate_gpu(py::module &m) { + instantiate_coordsman_gpu(m); + instantiate_func_gpu(m, std::string("f")); + instantiate_func_gpu(m, std::string("d")); +} + template void bind_native(py::module &m) { std::string name = std::string("CoordsKey"); py::class_(m, name.c_str()) @@ -256,6 +451,7 @@ template void bind_native(py::module &m) { .def("__repr__", [](const mink::CoordsKey &a) { return a.toString(); }); // Quantization + // TODO(ljm): quantize_np and quantize_th only support CPU currently. 
m.def("quantize_np", &mink::quantize_np); m.def("quantize_th", &mink::quantize_th); m.def("quantize_label_np", &mink::quantize_label_np); @@ -271,4 +467,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { bind_native(m); instantiate(m); +#ifndef CPU_ONLY + instantiate_gpu(m); +#endif } diff --git a/setup.py b/setup.py index 314e257e..dd1f1b78 100644 --- a/setup.py +++ b/setup.py @@ -106,11 +106,17 @@ def _argparse(pattern, argv, is_flag=True): return arr[0].split("=")[1], argv +print("argv: ") +print(argv) # For cpu only build CPU_ONLY, argv = _argparse("--cpu_only", argv) CPU_ONLY = CPU_ONLY or not torch.cuda.is_available() KEEP_OBJS, argv = _argparse("--keep_objs", argv) FORCE_CUDA, argv = _argparse("--force_cuda", argv) +print("CPU_ONLY: ") +print(CPU_ONLY) +print("FORCE_CUDA: ") +print(FORCE_CUDA) # args with return value CUDA_HOME, argv = _argparse("--cuda_home", argv, False) @@ -125,7 +131,8 @@ def _argparse(pattern, argv, is_flag=True): "PYTHON=" + sys.executable, # curr python ] -extra_compile_args = ["-Wno-deprecated-declarations"] +extra_compile_args = [] +#extra_compile_args = ["-Wno-deprecated-declarations"] extra_link_args = [] libraries = ["minkowski"] @@ -140,6 +147,7 @@ def _argparse(pattern, argv, is_flag=True): else: # system python installation libraries.append("cusparse") + libraries.append("cudadevrt") if not (CUDA_HOME is False): # False when not set, str otherwise print(f"Using CUDA_HOME={CUDA_HOME}") @@ -187,6 +195,20 @@ def _argparse(pattern, argv, is_flag=True): run_command(*compile_args) +''' +print("extra_compile_args: ") +print(extra_compile_args) +print("extra_link_args: ") +print(extra_link_args) +extra_compile_args = { + #'cxx': ['-DBATCH_FIRST=1',], + 'cxx': ['-DBATCH_FIRST=1', '-MMD', '-MP', '-ffast-math', '-funsafe-math-optimizations', '-fno-math-errno', '-DBATCH_FIRST=1', '-fopenmp', '-fPIC', '-fwrapv', '-std=c++14', '-DNDEBUG', '-O3', '-DTORCH_API_INCLUDE_EXTENSION_H', '-DTORCH_EXTENSION_NAME=minkowski', '-D_GLIBCXX_USE_CXX11_ABI=0', '-Wall', '-Wcomment', '-Wno-sign-compare', '-Wno-deprecated-declarations',], + 'nvcc': ['-DBATCH_FIRST=1', '-arch=sm_61', '-rdc=true', '--compiler-options', '-fPIC'], + 'nvcclink': ['-arch=sm_61', '--device-link', '--compiler-options', '-fPIC'], + } +#extra_link_args = ['-pthread', '--device-link', '--compiler-options', '-fPIC', '-Wall', '-Wcomment', '-Wno-sign-compare', '-Wno-deprecated-declarations'] +extra_link_args = ['-pthread', '-fPIC', '-Wall', '-Wcomment', '-Wno-sign-compare', '-Wno-deprecated-declarations'] +''' # Python interface setup( name="MinkowskiEngine", @@ -199,8 +221,9 @@ def _argparse(pattern, argv, is_flag=True): name="MinkowskiEngineBackend", include_dirs=[here, get_python_inc() + "/.."], library_dirs=["objs"], - sources=["pybind/minkowski.cpp",], - libraries=libraries, + sources=["pybind/minkowski.cpp", + ], + libraries=libraries + ["cudart", "cudadevrt"], extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, ) @@ -243,3 +266,37 @@ def _argparse(pattern, argv, is_flag=True): ], python_requires=">=3.6", ) + +''' + "src/convolution.cpp", + "src/math_functions.cpp", + "src/coordsmap.cpp", + "src/gpu_coordsmap.cpp", + "src/pooling_max.cpp", + "src/coords_key.cpp", + "src/pooling_avg.cpp", + "src/pooling_global_avg.cpp", + "src/quantization.cpp", + "src/pooling_global_max.cpp", + "src/pruning.cpp", + "src/3rdparty/gpu_coords_map/include/cuda_unordered_map.cpp", + "src/broadcast.cpp", + "src/coords_manager.cpp", + "src/gpu_coords_manager.cpp", + "src/region.cpp", + "src/pooling_transpose.cpp", 
+ "src/convolution_transpose.cpp", + "src/union.cpp", + "src/pooling_avg.cu", + "src/union.cu", + "src/pooling_max.cu", + "src/math_functions.cu", + "src/pruning.cu", + "src/3rdparty/gpu_coords_map/include/slab_hash/slab_hash.cu", + "src/3rdparty/gpu_coords_map/include/slab_hash/slab_alloc.cu", + "src/3rdparty/gpu_coords_map/include/coordinate.cu", + "src/broadcast.cu", + "src/gpu.cu", + "src/convolution.cu", + ], +''' diff --git a/src/3rdparty/gpu_coords_map/.clang-format b/src/3rdparty/gpu_coords_map/.clang-format new file mode 100644 index 00000000..678d1f6b --- /dev/null +++ b/src/3rdparty/gpu_coords_map/.clang-format @@ -0,0 +1,10 @@ +BasedOnStyle: Google +IndentWidth: 4 +ColumnLimit: 80 +UseTab: Never +Language: Cpp +Standard: Cpp11 +ContinuationIndentWidth: 8 +AccessModifierOffset: -4 +BinPackParameters: false +SortIncludes: true diff --git a/src/3rdparty/gpu_coords_map/CMakeLists.txt b/src/3rdparty/gpu_coords_map/CMakeLists.txt new file mode 100644 index 00000000..483b5640 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/CMakeLists.txt @@ -0,0 +1,72 @@ +cmake_minimum_required (VERSION 3.5 FATAL_ERROR) +project (SlabHash) + +find_package(CUDA 10.1 REQUIRED) + +option(CMAKE_VERBOSE_MAKEFILE ON) + +set(CUDA_NVCC_FLAGS -std=c++11) +set (CMAKE_CXX_STANDARD 11) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + +set(GENCODE_SM30 + -gencode=arch=compute_30,code=sm_30 -gencode=arch=compute_30,code=compute_30) +set(GENCODE_SM35 + -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_35,code=compute_35) +set(GENCODE_SM37 + -gencode=arch=compute_37,code=sm_37 -gencode=arch=compute_37,code=compute_37) +set(GENCODE_SM50 + -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_50,code=compute_50) +set(GENCODE_SM60 + -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60) +set(GENCODE_SM61 + -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_61,code=compute_61) +set(GENCODE_SM70 + -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70) +set(GENCODE_SM71 + -gencode=arch=compute_71,code=sm_71 -gencode=arch=compute_71,code=compute_71) + +option(SLABHASH_GENCODE_SM30 "GENCODE_SM30" OFF) +option(SLABHASH_GENCODE_SM35 "GENCODE_SM35" OFF) +option(SLABHASH_GENCODE_SM37 "GENCODE_SM37" OFF) +option(SLABHASH_GENCODE_SM50 "GENCODE_SM50" OFF) +option(SLABHASH_GENCODE_SM60 "GENCODE_SM60" OFF) +option(SLABHASH_GENCODE_SM61 "GENCODE_SM61" ON) +option(SLABHASH_GENCODE_SM70 "GENCODE_SM70" OFF) +option(SLABHASH_GENCODE_SM71 "GENCODE_SM71" OFF) + +if (SLABHASH_GENCODE_SM30) + set(GENCODE ${GENCODE} ${GENCODE_SM30}) +endif(SLABHASH_GENCODE_SM30) + +if (SLABHASH_GENCODE_SM35) + set(GENCODE ${GENCODE} ${GENCODE_SM35}) +endif(SLABHASH_GENCODE_SM35) + +if (SLABHASH_GENCODE_SM37) + set(GENCODE ${GENCODE} ${GENCODE_SM37}) +endif(SLABHASH_GENCODE_SM37) + +if (SLABHASH_GENCODE_SM50) + set(GENCODE ${GENCODE} ${GENCODE_SM50}) +endif(SLABHASH_GENCODE_SM50) + +if (SLABHASH_GENCODE_SM60) + set(GENCODE ${GENCODE} ${GENCODE_SM60}) +endif(SLABHASH_GENCODE_SM60) + +if (SLABHASH_GENCODE_SM61) + set(GENCODE ${GENCODE} ${GENCODE_SM61}) +endif(SLABHASH_GENCODE_SM61) + +if (SLABHASH_GENCODE_SM70) + set(GENCODE ${GENCODE} ${GENCODE_SM70}) +endif(SLABHASH_GENCODE_SM70) + +if(SLABHASH_GENCODE_SM71) + set(GENCODE ${GENCODE} ${GENCODE_SM71}) +endif(SLABHASH_GENCODE_SM71) + +include_directories(include) +add_subdirectory(test) diff --git a/src/3rdparty/gpu_coords_map/LICENSE b/src/3rdparty/gpu_coords_map/LICENSE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ 
b/src/3rdparty/gpu_coords_map/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/src/3rdparty/gpu_coords_map/README.md b/src/3rdparty/gpu_coords_map/README.md
new file mode 100644
index 00000000..1742fe59
--- /dev/null
+++ b/src/3rdparty/gpu_coords_map/README.md
@@ -0,0 +1,72 @@
+# Multi-thread GPU CoordinateHash with Shared SLAB Router
+Multi-thread version of GPU CoordinateHash with a shared slab router.
+
+## What's new?
+- Light head: the table head has been shrunk by a factor of 32.
+- Singleton: the slab router is reused by making SlabAlloc a singleton.
+- Multi-thread: one slab router, multiple table heads.
+- Random hash: one random number per table head, which reduces
+  collisions.
+
+## Usage example:
+More details in test_unique_with_remove_multithread.cu.
+
+```
+int main() {
+    std::vector<std::thread> vt;
+    vt.reserve(50);
+    for (int i = 0; i != 50; ++i) {
+        vt.emplace_back(std::thread([i] {
+            TEST_6DIM_KEYS_THRUST(1000000);
+            std::cout << "Finish " << i << "th TEST_6DIM_KEYS_THRUST" << std::endl;
+        }));
+    }
+
+    for (int i = 0; i != 50; ++i) {
+        vt[i].join();
+    }
+}
+```
+
+## TODO
+
+1. General improvements:
+
+   - reduce the number of memory allocations and memory copies.
+   - CUDA memory pool for key-value storage.
+
+2. Customize it for specific use cases:
+
+   - custom kernels
+   - custom memory handling
+
+---------------------------------
+
+# GPU CoordinateHash
+This is a modified version of [SlabHash](https://github.com/owensgroup/SlabHash).
+
+The original SlabHash only supports key-value pairs.
+This version supports arbitrary value types and multi-dimensional keys in theory, as well as user-defined hash functions supplied as a template parameter.
+
+Currently only `Key, Value` has been tested.
+
+## Publication
+This library is based on the original slab hash paper, initially proposed in the following IPDPS'18 paper:
+* [Saman Ashkiani, Martin Farach-Colton, John Owens, *A Dynamic Hash Table for the GPU*, 2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS)](https://ieeexplore.ieee.org/abstract/document/8425196)
+
+This library is a refactored and slightly redesigned version of the original code, so that it can be extended and used in other research projects as well. It is still under continuous development. If you find any problem with the code, or have suggestions for potential additions to the library, please raise an issue on GitHub. We will address it as soon as possible.
+
+## Compilation
+1. Make sure to edit `CMakeLists.txt` so that it reflects the GPU device's compute capability. For example, to include compute 3.5 you should have `option(SLABHASH_GENCODE_SM35 "GENCODE_SM35" ON)`.
+2. `mkdir build && cd build`
+3. `cmake ..`
+4. `make -j4`
+
+## Usage
+It is now a header-only library. Include `coordinate_hash_map.cuh` or `coordinate_indexer.cuh` in your .cu file to use the library. Documentation TBD.
+
+## TODO
+- Update copyrights to be consistent with the original Apache license from [SlabHash](https://github.com/owensgroup/SlabHash).
+- Include parallel iterators.
+- Add pybind and improve the `touch/allocate` function for voxel hashing.
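For reference, below is a minimal host-side sketch of how the `cuda::unordered_map` wrapper added later in this patch (`src/3rdparty/gpu_coords_map/include/cuda_unordered_map.h`) could be driven. The two-parameter form `cuda::unordered_map<Key, Value>` with defaulted hash/allocator template arguments, and the `Coordinate<int, 6>` key type, are assumptions inferred from the explicit instantiation in `cuda_unordered_map.cpp`; the actual template parameter list is not fully visible in the patch.

```cpp
// Sketch only: assumes cuda::unordered_map<Key, Value> with defaulted hash and
// allocator template parameters, and a 6-D integer Coordinate key, as suggested
// (but not spelled out) by the headers added in this patch.
#include <vector>

#include "coordinate.h"
#include "cuda_unordered_map.h"

int main() {
    using Key = Coordinate<int, 6>;  // 6-D coordinate key (assumed dimension)

    // Constructor shown in the header: max_keys plus defaulted duplicate_factor,
    // keys_per_bucket, and CUDA device index.
    cuda::unordered_map<Key, int> map(/*max_keys=*/1 << 20);

    // Host-side keys; std::vector value-initializes them, and Coordinate
    // exposes operator[] per dimension.
    std::vector<Key> keys(2);
    keys[0][0] = 1; keys[0][1] = 2;
    keys[1][0] = 3; keys[1][1] = 4;

    // BulkBuild copies the keys to the device, inserts them, and returns the
    // number of stored entries.
    int stored = map.BulkBuild(keys);

    // Search returns (values, masks) as thrust::device_vectors; a zero mask
    // entry means the corresponding key was not found.
    auto result = map.Search(keys);

    return stored > 0 ? 0 : 1;
}
```

Per the constructor and destructor added in the header, device selection (`cudaSetDevice`) and buffer management are handled internally through the `CudaAllocator`, so the caller only supplies host or `thrust::device_vector` inputs.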
diff --git a/src/3rdparty/gpu_coords_map/include/coordinate.cu b/src/3rdparty/gpu_coords_map/include/coordinate.cu new file mode 100644 index 00000000..5ce27910 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/coordinate.cu @@ -0,0 +1,35 @@ +#include "coordinate.h" + +///* +template +__device__ __host__ bool Coordinate::operator==(const Coordinate& rhs) const { + bool equal = true; +#pragma unroll 1 + for (size_t i = 0; i < D; ++i) { + equal = equal && (data_[i] == rhs[i]); + } + return equal; +} +//*/ + +template +struct CoordinateHashFunc { + __device__ __host__ uint64_t operator()(const Coordinate& key) const { + uint64_t hash = UINT64_C(14695981039346656037); + + /** We only support 4-byte and 8-byte types **/ + using input_t = typename std::conditional::type; +#pragma unroll 1 + for (size_t i = 0; i < D; ++i) { + hash ^= *((input_t*)(&key[i])); + hash *= UINT64_C(1099511628211); + } + return hash; + } +}; + +template class Coordinate; +template class Coordinate; +template class Coordinate; +template class Coordinate; diff --git a/src/3rdparty/gpu_coords_map/include/coordinate.h b/src/3rdparty/gpu_coords_map/include/coordinate.h new file mode 100644 index 00000000..dc661137 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/coordinate.h @@ -0,0 +1,59 @@ +// +// Created by dongw1 on 7/1/19. +// + +#include +#include + +template +struct Coordinate { +private: + T data_[D]; + +public: + __device__ __host__ T& operator[](size_t i) { return data_[i]; } + __device__ __host__ const T& operator[](size_t i) const { return data_[i]; } + + __device__ __host__ bool operator==(const Coordinate& rhs) const; + /* + __device__ __host__ bool operator==(const Coordinate& rhs) const { + bool equal = true; +#pragma unroll 1 + for (size_t i = 0; i < D; ++i) { + equal = equal && (data_[i] == rhs[i]); + } + return equal; + } + */ + + static __host__ Coordinate random( + std::default_random_engine generator, + std::uniform_int_distribution dist) { + Coordinate res; + for (size_t i = 0; i < D; ++i) { + res.data_[i] = dist(generator); + } + return res; + } +}; + +template +struct CoordinateHashFunc; +/* +template +struct CoordinateHashFunc { + __device__ __host__ uint64_t operator()(const Coordinate& key) const { + uint64_t hash = UINT64_C(14695981039346656037); + + // We only support 4-byte and 8-byte types + using input_t = typename std::conditional::type; +#pragma unroll 1 + for (size_t i = 0; i < D; ++i) { + hash ^= *((input_t*)(&key[i])); + hash *= UINT64_C(1099511628211); + } + return hash; + } +}; +*/ diff --git a/src/3rdparty/gpu_coords_map/include/cuda_unordered_map.cpp b/src/3rdparty/gpu_coords_map/include/cuda_unordered_map.cpp new file mode 100644 index 00000000..91782030 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/cuda_unordered_map.cpp @@ -0,0 +1,294 @@ +#include "cuda_unordered_map.h" +#include "coordinate.h" + +namespace cuda { +////////////// +template +void unordered_map::BulkInsert(const int* p_coords, + int* p_mapping, + int* p_inverse_mapping, + int size, int key_chunks_) { + assert(key_chunks_ == key_chunks); + slab_hash_->BulkInsertWithMapping(p_coords, p_mapping, + p_inverse_mapping, size); +} + +template +void +unordered_map:: +IterateKeys(int* p_coords, int size) { + slab_hash_->IterateKeys(p_coords, size); +} + +template +void +unordered_map:: +IterateSearchAtBatch(int* p_out, int batch_index, int size) { + slab_hash_->IterateSearchAtBatch(p_out, batch_index, size); +} + +template +void +unordered_map:: +IterateSearchPerBatch(const std::vector& p_outs, int 
size) { + slab_hash_->IterateSearchPerBatch(p_outs, size); +} + +template +void +unordered_map:: +IterateOffsetInsert(const std::shared_ptr>& in_map, + int* p_offset, int size) { + slab_hash_->IterateOffsetInsert(in_map->get_slab_hash(), + p_offset, size); +} + +template +void +unordered_map:: +IterateOffsetInsertWithInsOuts(const std::shared_ptr>& in_map, + int* p_offset, + int* p_in, int* p_out, + int size) {} + +template +void +unordered_map:: +IterateOffsetSearch(const std::shared_ptr>& in_map, + int* p_offset, + int* p_in, int* p_out, + int size) {} + +template +void +unordered_map:: +IterateBatchInsert(const std::shared_ptr>& in_map, + int size) {} + +template +void +unordered_map:: +IterateBatchSearch(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + int size) {} + +template +void +unordered_map:: +IterateStrideInsert(const std::shared_ptr>& in_map, + const std::vector& tensor_strides, + int size) {} + +template +void +unordered_map:: +IterateStrideInsertWithInOut(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + const std::vector& tensor_strides, + int size) {} + +template +void +unordered_map:: +IterateStrideSearch(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + const std::vector& tensor_strides, + int size) {} + +template +void +unordered_map:: +IterateInsert(const std::shared_ptr>& in_map, + int size) {} + +template +void +unordered_map:: +IterateInsertWithInsOuts(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + int size) {} + +template +void +unordered_map:: +IterateSearch(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + int size) {} + +template +void +unordered_map:: +IteratePruneInsert(const std::shared_ptr>& in_map, + bool* p_keep, int keep_size, + int size) {} + +template +void +unordered_map:: +IteratePruneInsertWithInOut(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + bool* p_keep, int keep_size, + int size) {} + +template +void +unordered_map:: +IteratePruneSearch(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + bool* p_keep, int keep_size, + int size) {} +////////////// + +template class unordered_map, int, 5, 5>; + +} // cuda diff --git a/src/3rdparty/gpu_coords_map/include/cuda_unordered_map.h b/src/3rdparty/gpu_coords_map/include/cuda_unordered_map.h new file mode 100644 index 00000000..d82fb7d2 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/cuda_unordered_map.h @@ -0,0 +1,584 @@ +/* + * Copyright 2019 Saman Ashkiani, + * Modified 2019 by Wei Dong + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include +#include "slab_hash/slab_hash.h" + +/* + * Default hash function: + * It treat any kind of input as a concatenation of ints. 
+ */ +template +struct hash { + __device__ __host__ uint64_t operator()(const Key& key) const { + uint64_t hash = UINT64_C(14695981039346656037); + + const int chunks = sizeof(Key) / sizeof(int); + for (size_t i = 0; i < chunks; ++i) { + hash ^= ((int32_t*)(&key))[i]; + hash *= UINT64_C(1099511628211); + } + return hash; + } +}; + +/* Lightweight wrapper to handle host input */ +/* Key supports elementary types: int, long, etc. */ +/* Value supports arbitrary types in theory. */ +/* std::vector is specialized: it stores only one bit per element + * We have to use uint8_t instead to read and write masks + * https://en.wikipedia.org/w/index.php?title=Sequence_container_(C%2B%2B)&oldid=767869909#Specialization_for_bool + */ +namespace cuda { +using slab_hash::SlabHash; +using slab_hash::CudaAllocator; + +template , + class Alloc = CudaAllocator> +class unordered_map { +public: + using key_type = Key; + using value_type = Value; +public: +// static constexpr uint32_t LOG_NUM_MEM_BLOCKS = 5; +// static constexpr uint32_t LOG_NUM_SUPER_BLOCKS = 5; + static constexpr uint32_t key_chunks = sizeof(Key) / sizeof(uint32_t); + static constexpr uint32_t value_chunks = sizeof(Value) / sizeof(uint32_t); + static constexpr uint32_t MEM_UNIT_WARP_MULTIPLES = key_chunks + value_chunks; +// (sizeof(Key) + sizeof(Value)) / sizeof(uint32_t); +public: + unordered_map() {} + unordered_map(uint32_t max_keys, + /* Preset hash table params to estimate bucket num */ + float duplicate_factor = + 1.0 / pow(2, sizeof(Key) / sizeof(uint32_t) - 1), + uint32_t keys_per_bucket = 31 * 2, + /* CUDA device */ + const uint32_t device_idx = 0); + ~unordered_map(); + + void reserve(uint32_t max_keys, + /* Preset hash table params to estimate bucket num */ + float duplicate_factor = + 1.0 / pow(2, sizeof(Key) / sizeof(uint32_t) - 1), + uint32_t keys_per_bucket = 31 * 2, + /* CUDA device */ + const uint32_t device_idx = 0); + + /* Minimal output */ + /* No output for Insert */ + Value Size(); + Value BulkBuild(const std::vector& input_keys); + Value BulkBuild(thrust::device_vector& input_keys); + Value BulkBuild(Key* input_keys, int num_keys); + + /* Value and mask output for Search */ + std::pair, thrust::device_vector> + Search(const std::vector& input_keys); + std::pair, thrust::device_vector> + Search(thrust::device_vector& input_keys); + std::pair, thrust::device_vector> + Search(Key* input_keys, int num_keys); + + /* No output for Remove */ + void Remove(const std::vector& input_keys); + void Remove(thrust::device_vector& input_keys); + void Remove(Key* input_keys, int num_keys); + + std::pair>, + thrust::device_vector> + Search_(thrust::device_vector& input_keys); + + thrust::device_vector Remove_( + thrust::device_vector& input_keys); + + /* Assistance functions */ + float ComputeLoadFactor(); + std::vector CountElemsPerBucket(); + void CountElems(thrust::device_vector& count); + + ////////////// + void BulkInsert(const int* p_coords, + int* p_mapping, + int* p_inverse_mapping, + int size, int key_chunks_); + void IterateKeys(int* p_coords, int size); + void IterateSearchAtBatch(int* p_out, int batch_index, int size); + void IterateSearchPerBatch(const std::vector& p_outs, int size); + void IterateOffsetInsert(const std::shared_ptr>& in_map, + int* p_offset, int size); + void IterateOffsetInsertWithInsOuts(const std::shared_ptr>& in_map, + int* p_offset, + int* p_in, int* p_out, + int size); + void IterateOffsetSearch(const std::shared_ptr>& in_map, + int* p_offset, + int* p_in, int* p_out, + int size); + void 
IterateBatchInsert(const std::shared_ptr>& in_map, + int size); + void IterateBatchSearch(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + int size); + void IterateStrideInsert(const std::shared_ptr>& in_map, + const std::vector& tensor_strides, + int size); + void IterateStrideInsertWithInOut(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + const std::vector& tensor_strides, + int size); + void IterateStrideSearch(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + const std::vector& tensor_strides, + int size); + void IterateInsert(const std::shared_ptr>& in_map, + int size); + void IterateInsertWithInsOuts(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + int size); + void IterateSearch(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + int size); + void IteratePruneInsert(const std::shared_ptr>& in_map, + bool* p_keep, int keep_size, + int size); + void IteratePruneInsertWithInOut(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + bool* p_keep, int keep_size, + int size); + void IteratePruneSearch(const std::shared_ptr>& in_map, + int* p_in, int* p_out, + bool* p_keep, int keep_size, + int size); + ////////////// + + const std::shared_ptr>& + get_slab_hash() const { + return slab_hash_; + } + +private: + uint32_t max_keys_; + uint32_t num_buckets_; + uint32_t cuda_device_idx_; + + /* Buffer for input cpu data (e.g. from std::vector) */ + Key* input_key_buffer_; + Key* output_key_buffer_; + Value* output_value_buffer_; + _Iterator* output_iterator_buffer_; + uint8_t* output_mask_buffer_; + + std::shared_ptr> slab_hash_; + std::shared_ptr allocator_; +}; + +/* +template +unordered_map::unordered_map( + uint32_t max_keys, + float duplicate_factor, + uint32_t keys_per_bucket, + const uint32_t device_idx) { + reserve(max_keys, duplicate_factor, keys_per_bucket, device_idx); +} +*/ + +template +unordered_map::unordered_map( + uint32_t max_keys, + float duplicate_factor, + uint32_t keys_per_bucket, + const uint32_t device_idx) { + max_keys_ = max_keys; + cuda_device_idx_ = device_idx; + /* Set bucket size */ + uint32_t expected_unique_keys = max_keys * duplicate_factor; + num_buckets_ = (expected_unique_keys + keys_per_bucket - 1) / keys_per_bucket; + + /* Set device */ + int32_t cuda_device_count_ = 0; + CHECK_CUDA(cudaGetDeviceCount(&cuda_device_count_)); + assert(cuda_device_idx_ < cuda_device_count_); + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + allocator_ = std::make_shared(cuda_device_idx_); + + // allocating key, value arrays to buffer input and output: + input_key_buffer_ = allocator_->template allocate(max_keys_); + output_key_buffer_ = allocator_->template allocate(max_keys_); + output_value_buffer_ = allocator_->template allocate(max_keys_); + output_mask_buffer_ = allocator_->template allocate(max_keys_); + output_iterator_buffer_ = + allocator_->template allocate<_Iterator>(max_keys_); + + // allocate an initialize the allocator: + slab_hash_ = std::make_shared>( + num_buckets_, max_keys_, cuda_device_idx_); +} + +template +unordered_map::~unordered_map() { + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + + allocator_->template deallocate(input_key_buffer_); + allocator_->template deallocate(output_key_buffer_); + allocator_->template deallocate(output_value_buffer_); + allocator_->template deallocate(output_mask_buffer_); + allocator_->template deallocate<_Iterator>( + output_iterator_buffer_); +} + +template +Value unordered_map::Size() { + auto elems_per_bucket = slab_hash_->CountElemsPerBucket(); + int total_elems_stored 
= std::accumulate(elems_per_bucket.begin(), + elems_per_bucket.end(), 0); + return total_elems_stored; +} + +template +Value unordered_map::BulkBuild( + const std::vector& input_keys) { + assert(input_keys.size() <= max_keys_); + + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + CHECK_CUDA(cudaMemcpy(input_key_buffer_, input_keys.data(), + sizeof(Key) * input_keys.size(), + cudaMemcpyHostToDevice)); + + slab_hash_->InsertAtomic(input_key_buffer_, + input_keys.size()); + return slab_hash_->Size(); +} + +template +Value unordered_map::BulkBuild( + thrust::device_vector& input_keys) { + assert(input_keys.size() <= max_keys_); + + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + slab_hash_->InsertAtomic(thrust::raw_pointer_cast(input_keys.data()), + input_keys.size()); + return slab_hash_->Size(); +} + +template +Value unordered_map::BulkBuild(Key* input_keys, + int num_keys) { + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + slab_hash_->InsertAtomic(input_keys, num_keys); + return slab_hash_->Size(); +} + +template +std::pair, thrust::device_vector> +unordered_map::Search( + const std::vector& input_keys) { + assert(input_keys.size() <= max_keys_); + + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + CHECK_CUDA(cudaMemset(output_mask_buffer_, 0, + sizeof(uint8_t) * input_keys.size())); + + CHECK_CUDA(cudaMemcpy(input_key_buffer_, input_keys.data(), + sizeof(Key) * input_keys.size(), + + cudaMemcpyHostToDevice)); + slab_hash_->Search(input_key_buffer_, output_value_buffer_, + output_mask_buffer_, input_keys.size()); + CHECK_CUDA(cudaDeviceSynchronize()); + + thrust::device_vector output_values( + output_value_buffer_, output_value_buffer_ + input_keys.size()); + thrust::device_vector output_masks( + output_mask_buffer_, output_mask_buffer_ + input_keys.size()); + return std::make_pair(output_values, output_masks); +} + +template +std::pair, thrust::device_vector> +unordered_map::Search( + thrust::device_vector& input_keys) { + assert(input_keys.size() <= max_keys_); + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + CHECK_CUDA(cudaMemset(output_mask_buffer_, 0, + sizeof(uint8_t) * input_keys.size())); + + slab_hash_->Search(thrust::raw_pointer_cast(input_keys.data()), + output_value_buffer_, output_mask_buffer_, + input_keys.size()); + CHECK_CUDA(cudaDeviceSynchronize()); + + thrust::device_vector output_values( + output_value_buffer_, output_value_buffer_ + input_keys.size()); + thrust::device_vector output_masks( + output_mask_buffer_, output_mask_buffer_ + input_keys.size()); + return std::make_pair(output_values, output_masks); +} + +template +std::pair, thrust::device_vector> +unordered_map::Search(Key* input_keys, int num_keys) { + assert(num_keys <= max_keys_); + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + CHECK_CUDA(cudaMemset(output_mask_buffer_, 0, sizeof(uint8_t) * num_keys)); + + slab_hash_->Search(input_keys, output_value_buffer_, output_mask_buffer_, + num_keys); + CHECK_CUDA(cudaDeviceSynchronize()); + + thrust::device_vector output_values(output_value_buffer_, + output_value_buffer_ + num_keys); + thrust::device_vector output_masks(output_mask_buffer_, + output_mask_buffer_ + num_keys); + return std::make_pair(output_values, output_masks); +} + +template +void unordered_map::Remove( + const std::vector& input_keys) { + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + CHECK_CUDA(cudaMemcpy(input_key_buffer_, input_keys.data(), + sizeof(Key) * input_keys.size(), + cudaMemcpyHostToDevice)); + slab_hash_->Remove(input_key_buffer_, input_keys.size()); +} + +template +void 
unordered_map::Remove( + thrust::device_vector& input_keys) { + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + + slab_hash_->Remove(thrust::raw_pointer_cast(input_keys.data()), + input_keys.size()); +} + +template +void unordered_map::Remove(Key* input_keys, + int num_keys) { + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + slab_hash_->Remove(input_keys, num_keys); +} + +template +std::pair>, + thrust::device_vector> +unordered_map::Search_( + thrust::device_vector& input_keys) { + assert(input_keys.size() <= max_keys_); + + CHECK_CUDA(cudaSetDevice(cuda_device_idx_)); + CHECK_CUDA(cudaMemset(output_mask_buffer_, 0, + sizeof(uint8_t) * input_keys.size())); + + slab_hash_->Search_(thrust::raw_pointer_cast(input_keys.data()), + output_iterator_buffer_, output_mask_buffer_, + input_keys.size()); + CHECK_CUDA(cudaDeviceSynchronize()); + + thrust::device_vector<_Iterator> output_iterators( + output_iterator_buffer_, + output_iterator_buffer_ + input_keys.size()); + thrust::device_vector output_masks( + output_mask_buffer_, output_mask_buffer_ + input_keys.size()); + return std::make_pair(output_iterators, output_masks); +} + +template +std::vector unordered_map::CountElemsPerBucket() { + return slab_hash_->CountElemsPerBucket(); +} + +template +void unordered_map::CountElems( + thrust::device_vector& count) { + std::vector std_count(3); + thrust::copy(count.begin(), count.end(), std_count.begin()); + printf("Before count: %d\t%d\t%d\n", std_count[0], std_count[1], std_count[2]); + slab_hash_->CountElems(thrust::raw_pointer_cast(count.data())); + thrust::copy(count.begin(), count.end(), std_count.begin()); + printf("After count: %d\t%d\t%d\n", std_count[0], std_count[1], std_count[2]); + assert(std_count[0] == 0); + assert(std_count[1] == 0); + assert(std_count[2] == 0); +} + +template +float unordered_map::ComputeLoadFactor() { + return slab_hash_->ComputeLoadFactor(); +} +} // namespace cuda diff --git a/src/3rdparty/gpu_coords_map/include/helper_cuda.h b/src/3rdparty/gpu_coords_map/include/helper_cuda.h new file mode 100644 index 00000000..68f460e1 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/helper_cuda.h @@ -0,0 +1,55 @@ +/* + * Copyright 2018 Saman Ashkiani + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#define CHECK_CUDA(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + printf("CUDA error at %s %d: %s\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +class CudaTimer { +public: + CudaTimer() { + CHECK_CUDA(cudaEventCreate(&start_)); + CHECK_CUDA(cudaEventCreate(&stop_)); + } + ~CudaTimer() { + CHECK_CUDA(cudaEventDestroy(start_)); + CHECK_CUDA(cudaEventDestroy(stop_)); + } + + void Start() { CHECK_CUDA(cudaEventRecord(start_, 0)); } + + float Stop() { + float time; + CHECK_CUDA(cudaEventRecord(stop_, 0)); + CHECK_CUDA(cudaEventSynchronize(stop_)); + CHECK_CUDA(cudaEventElapsedTime(&time, start_, stop_)); + return time; + } + +private: + cudaEvent_t start_; + cudaEvent_t stop_; +}; diff --git a/src/3rdparty/gpu_coords_map/include/slab_hash/allocator.h b/src/3rdparty/gpu_coords_map/include/slab_hash/allocator.h new file mode 100644 index 00000000..dfc7bed3 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/slab_hash/allocator.h @@ -0,0 +1,63 @@ +#include "../helper_cuda.h" +#include "config.h" + +#pragma once + +namespace slab_hash { + +class Allocator { +public: + Allocator(int device_id = 0) : device_id_(device_id) {} + template + T* allocate(size_t size) {} + + template + void deallocate(T* ptr) {} + +protected: + int device_id_; +}; + +class CudaAllocator: public Allocator { +public: + CudaAllocator(int device_id = 0) : Allocator(device_id) {} + template + T* allocate(size_t size) { + T* ptr; + CHECK_CUDA(cudaMalloc((void**)&ptr, sizeof(T) * size)); + return ptr; + } + + template + void deallocate(T* ptr) { + CHECK_CUDA(cudaFree(ptr)); + } +}; + +/** +class PyTorchAllocator: public Allocator { +public: + PyTorchAllocator(int device_id = 0) : Allocator(device_id) {} + + template + T* allocate(size_t size) { + CHECK_CUDA(cudaGetDevice(&device_id_)); + auto options = torch::TensorOptions() + .dtype(torch::kInt8) + .device(torch::kCUDA, device_id_) + .requires_grad(false); + tensor_ = torch::zeros(sizeof(T) * size, options); + return tensor_.data() + } + + template + void deallocate(T* ptr) { + // let PyTorch handle this + } + +protected: + torch::Tensor tensor_; +}; +**/ + +} // namespace slab_hash diff --git a/src/3rdparty/gpu_coords_map/include/slab_hash/config.h b/src/3rdparty/gpu_coords_map/include/slab_hash/config.h new file mode 100644 index 00000000..473d56cc --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/slab_hash/config.h @@ -0,0 +1,43 @@ +/* + * Copyright 2019 Saman Ashkiani + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing permissions and + * limitations under the License. 
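// A short usage sketch of the helpers defined above: CHECK_CUDA aborts with file/line
// information on any CUDA error, CudaTimer measures elapsed GPU time between two
// recorded events, and slab_hash::CudaAllocator is the thin cudaMalloc/cudaFree wrapper
// the slab hash uses for its device buffers. The function name below is illustrative.
#include <cstdint>
#include <cuda_runtime.h>

void helper_usage_sketch() {
    slab_hash::CudaAllocator alloc(/* device_id = */ 0);

    // allocate<T>(n) returns a device pointer to n elements of T.
    uint32_t* d_buf = alloc.allocate<uint32_t>(1 << 20);

    CudaTimer timer;
    timer.Start();
    CHECK_CUDA(cudaMemset(d_buf, 0, (1 << 20) * sizeof(uint32_t)));
    float ms = timer.Stop();   // milliseconds between the Start and Stop events
    (void)ms;

    alloc.deallocate(d_buf);
}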
+ */ + +#pragma once + +#include + +/** Built-in flags **/ +static constexpr uint32_t EMPTY_SLAB_PTR = 0xFFFFFFFF; +static constexpr uint32_t EMPTY_PAIR_PTR = 0xFFFFFFFF; +static constexpr uint32_t HEAD_SLAB_PTR = 0xFFFFFFFE; + +/** Queries **/ +static constexpr uint32_t SEARCH_NOT_FOUND = 0xFFFFFFFF; + +/** Warp operations **/ +static constexpr uint32_t WARP_WIDTH = 32; +static constexpr uint32_t BLOCKSIZE_ = 128; + +/* bits: 31 | 30 | ... | 3 | 2 | 1 | 0 */ +static constexpr uint32_t ACTIVE_LANES_MASK = 0xFFFFFFFF; +static constexpr uint32_t PAIR_PTR_LANES_MASK = 0x7FFFFFFF; +static constexpr uint32_t NEXT_SLAB_PTR_LANE = 31; + +using addr_t = uint32_t; + +/* These types are all the same, but distiguish the naming can lead to clearer + * meanings*/ +using ptr_t = uint32_t; +static constexpr uint32_t NULL_ITERATOR = 0xFFFFFFFF; diff --git a/src/3rdparty/gpu_coords_map/include/slab_hash/slab_alloc.cu b/src/3rdparty/gpu_coords_map/include/slab_hash/slab_alloc.cu new file mode 100644 index 00000000..0d53cecd --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/slab_hash/slab_alloc.cu @@ -0,0 +1,151 @@ +#include +#include +#include "slab_alloc.h" + +namespace slab_hash { + +template +std::vector SlabAlloc<_Alloc, _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>:: + CountSlabsPerSuperblock() { + const uint32_t num_super_blocks = slab_alloc_context_.num_super_blocks_; + + auto slabs_per_superblock_buffer = + allocator_->template allocate(num_super_blocks); + thrust::device_vector slabs_per_superblock( + slabs_per_superblock_buffer, + slabs_per_superblock_buffer + num_super_blocks); + thrust::fill(slabs_per_superblock.begin(), slabs_per_superblock.end(), + 0); + + // counting total number of allocated memory units: + int blocksize = 128; + int num_mem_units = + slab_alloc_context_.NUM_MEM_BLOCKS_PER_SUPER_BLOCK_ * 32; + int num_cuda_blocks = (num_mem_units + blocksize - 1) / blocksize; + CountSlabsPerSuperblockKernel<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES><<>>( + slab_alloc_context_, + thrust::raw_pointer_cast(slabs_per_superblock.data())); + + std::vector result(num_super_blocks); + thrust::copy(slabs_per_superblock.begin(), slabs_per_superblock.end(), + result.begin()); + allocator_->template deallocate(slabs_per_superblock_buffer); + return std::move(result); + } +template + __device__ uint32_t SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>:: + WarpAllocate(const uint32_t& lane_id) { + // tries and allocate a new memory units within the resident memory + // block if it returns 0xFFFFFFFF, then there was not any empty memory + // unit a new resident block should be chosen, and repeat again + // allocated result: _LOG_NUM_SUPER_BLOCKS bits: super_block_index + // (22 - _LOG_NUM_SUPER_BLOCKS) bits: memory block index + // 5 bits: memory unit index (hi-bits of 10bit) + // 5 bits: memory unit index (lo-bits of 10bit) + int empty_lane = -1; + uint32_t free_lane; + uint32_t read_bitmap = resident_bitmap_; + uint32_t allocated_result = 0xFFFFFFFF; + // works as long as <31 bit are used in the allocated_result + // in other words, if there are 32 super blocks and at most 64k blocks + // per super block + + while (allocated_result == 0xFFFFFFFF) { + empty_lane = __ffs(~resident_bitmap_) - 1; + free_lane = __ballot_sync(0xFFFFFFFF, empty_lane >= 0); + if (free_lane == 0) { + // all bitmaps are full: need to be rehashed again: + updateMemBlockIndex(((threadIdx.x + blockIdx.x * blockDim.x) >> + 5) + 
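// Note on the bitmap arithmetic in WarpAllocate: every memory block carries 32 bitmap
// words (BITMAP_SIZE_), one per warp lane, and each word tracks 32 memory units, so a
// block covers 32 * 32 = 1024 units, matching NUM_MEM_UNITS_PER_BLOCK_ in slab_alloc.h.
// __ffs(~resident_bitmap_) - 1 picks the first free unit in this lane's word, the
// atomicCAS on that word claims it, and a failed CAS merely refreshes the cached bitmap
// before the next attempt; only when no lane sees a free bit does the warp move to
// another memory block via updateMemBlockIndex().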
hash_coef_); + read_bitmap = resident_bitmap_; + continue; + } + uint32_t src_lane = __ffs(free_lane) - 1; + if (src_lane == lane_id) { + read_bitmap = atomicCAS( + super_blocks_ + super_block_index_ * SUPER_BLOCK_SIZE_ + + resident_index_ * BITMAP_SIZE_ + lane_id, + resident_bitmap_, resident_bitmap_ | (1 << empty_lane)); + if (read_bitmap == resident_bitmap_) { + // successful attempt: + resident_bitmap_ |= (1 << empty_lane); + allocated_result = + (super_block_index_ + << SUPER_BLOCK_BIT_OFFSET_ALLOC_) | + (resident_index_ << MEM_BLOCK_BIT_OFFSET_ALLOC_) | + (lane_id << MEM_UNIT_BIT_OFFSET_ALLOC_) | + empty_lane; + } else { + // Not successful: updating the current bitmap + resident_bitmap_ = read_bitmap; + } + } + // asking for the allocated result; + allocated_result = + __shfl_sync(0xFFFFFFFF, allocated_result, src_lane); + } + return allocated_result; + } + + // called when the allocator fails to find an empty unit to allocate: +template + __device__ void SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::updateMemBlockIndex(uint32_t global_warp_id) { + num_attempts_++; + assert(num_attempts_ < 11); + super_block_index_++; + super_block_index_ = (super_block_index_ == num_super_blocks_) + ? 0 + : super_block_index_; + + resident_index_++; + resident_index_ = (resident_index_ == NUM_MEM_BLOCKS_PER_SUPER_BLOCK_) + ? 0 + : resident_index_; + + // loading the assigned memory block: + resident_bitmap_ = + *((super_blocks_ + super_block_index_ * SUPER_BLOCK_SIZE_) + + resident_index_ * BITMAP_SIZE_ + (threadIdx.x & 0x1F)); + } + +template +__global__ void CountSlabsPerSuperblockKernel(SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> context, + uint32_t* slabs_per_superblock) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + + int num_bitmaps = context.NUM_MEM_BLOCKS_PER_SUPER_BLOCK_ * 32; + if (tid >= num_bitmaps) { + return; + } + + for (int i = 0; i < context.num_super_blocks_; i++) { + uint32_t read_bitmap = *(context.get_ptr_for_bitmap(i, tid)); + atomicAdd(&slabs_per_superblock[i], __popc(read_bitmap)); + } +} + +template class SlabAlloc; +template class SlabAllocContext<5, 5, 5>; + +//template __device__ uint32_t SlabAllocContext<5, 5, 5>::WarpAllocate(const uint32_t& lane_id); + +} // namespace slab_hash diff --git a/src/3rdparty/gpu_coords_map/include/slab_hash/slab_alloc.h b/src/3rdparty/gpu_coords_map/include/slab_hash/slab_alloc.h new file mode 100644 index 00000000..aa543e01 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/slab_hash/slab_alloc.h @@ -0,0 +1,424 @@ +/* + * Copyright 2018 Saman Ashkiani + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing permissions and + * limitations under the License. 
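// The 32-bit slab address returned by WarpAllocate() packs three indices. For the
// <_LOG_NUM_MEM_BLOCKS = 5, _LOG_NUM_SUPER_BLOCKS = 5, _MEM_UNIT_WARP_MULTIPLES = 5>
// instantiation above, the layout is:
//   bits 31..27  super block index                (_LOG_NUM_SUPER_BLOCKS bits)
//   bits 26..10  memory block index               (17 bits of room, 5 of them used)
//   bits  9..5   bitmap lane inside the block     (hi half of the 10-bit unit index)
//   bits  4..0   bit position inside that lane    (lo half of the 10-bit unit index)
// A host-side decoding sketch mirroring getSuperBlockIndex / getMemBlockIndex /
// getMemUnitIndex in SlabAllocContext (the struct name here is illustrative only):
#include <cstdint>

struct SlabAddrParts {
    uint32_t super_block;
    uint32_t mem_block;
    uint32_t mem_unit;
};

inline SlabAddrParts decode_slab_addr(uint32_t addr) {
    SlabAddrParts p;
    p.super_block = addr >> 27;              // SUPER_BLOCK_BIT_OFFSET_ALLOC_ = 32 - 5
    p.mem_block   = (addr >> 10) & 0x1FFFF;  // MEM_BLOCK_BIT_OFFSET_ALLOC_ = 10, 17-bit mask
    p.mem_unit    = addr & 0x3FF;            // MEM_UNIT_MASK_ = (1 << 10) - 1
    return p;
}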
+ */ + +#pragma once + +#include +#include +#include +#include +#include "../helper_cuda.h" +#include "allocator.h" +#include "config.h" +/* + * This class does not own any memory, and will be shallowly copied into device + * kernel + */ + +namespace slab_hash { + +template +class SlabAllocContext { +public: + static constexpr uint32_t NUM_SUPER_BLOCKS_ALLOCATOR_ = + (1 << _LOG_NUM_SUPER_BLOCKS); + + // fixed parameters for the SlabAlloc + static constexpr uint32_t NUM_MEM_UNITS_PER_BLOCK_ = 1024; + static constexpr uint32_t NUM_BITMAP_PER_MEM_BLOCK_ = 32; + static constexpr uint32_t BITMAP_SIZE_ = 32; + static constexpr uint32_t WARP_SIZE = 32; + static constexpr uint32_t MEM_UNIT_SIZE_ = + _MEM_UNIT_WARP_MULTIPLES * (WARP_SIZE - 1) + 1; + static constexpr uint32_t SUPER_BLOCK_BIT_OFFSET_ALLOC_ = + 32 - _LOG_NUM_SUPER_BLOCKS; + static constexpr uint32_t MEM_BLOCK_BIT_OFFSET_ALLOC_ = 10; + static constexpr uint32_t MEM_UNIT_BIT_OFFSET_ALLOC_ = 5; + static constexpr uint32_t NUM_MEM_BLOCKS_PER_SUPER_BLOCK_ = + (1 << _LOG_NUM_MEM_BLOCKS); + static constexpr uint32_t MEM_BLOCK_SIZE_ = + NUM_MEM_UNITS_PER_BLOCK_ * MEM_UNIT_SIZE_; + static constexpr uint32_t SUPER_BLOCK_SIZE_ = + ((BITMAP_SIZE_ + MEM_BLOCK_SIZE_) * + NUM_MEM_BLOCKS_PER_SUPER_BLOCK_); + static constexpr uint32_t MEM_BLOCK_OFFSET_ = + (BITMAP_SIZE_ * NUM_MEM_BLOCKS_PER_SUPER_BLOCK_); + static constexpr uint32_t num_super_blocks_ = NUM_SUPER_BLOCKS_ALLOCATOR_; + + static constexpr uint32_t MEM_BLOCKS_MASK_ = ((1 << _LOG_NUM_MEM_BLOCKS) - 1); + static constexpr uint32_t SUPER_BLOCKS_MASK_ = ((1 << _LOG_NUM_SUPER_BLOCKS) - 1); + static constexpr uint32_t MEM_BLOCK_MASK_ = ((1 << (SUPER_BLOCK_BIT_OFFSET_ALLOC_ - + MEM_BLOCK_BIT_OFFSET_ALLOC_)) - 1); + static constexpr uint32_t MEM_UNIT_MASK_ = ((1 << MEM_BLOCK_BIT_OFFSET_ALLOC_) - 1); + + SlabAllocContext() + : super_blocks_(nullptr), + hash_coef_(0), + num_attempts_(0), + resident_index_(0), + super_block_index_(0), + allocated_index_(0) {} + + SlabAllocContext& operator=(const SlabAllocContext& rhs) { + super_blocks_ = rhs.super_blocks_; + hash_coef_ = rhs.hash_coef_; + num_attempts_ = 0; + resident_index_ = 0; + super_block_index_ = 0; + allocated_index_ = 0; + return *this; + } + + ~SlabAllocContext() {} + + void Setup(uint32_t* super_blocks) { + super_blocks_ = super_blocks; + } + + __device__ __forceinline__ uint32_t* get_unit_ptr_from_slab( + const addr_t& next, const uint32_t& lane_id) { + return super_blocks_ + addressDecoder(next) + lane_id * _MEM_UNIT_WARP_MULTIPLES; + } + __device__ __forceinline__ uint32_t* get_ptr_for_bitmap( + const uint32_t super_block_index, const uint32_t bitmap_index) { + return super_blocks_ + super_block_index * SUPER_BLOCK_SIZE_ + + bitmap_index; + } + + // Objective: each warp selects its own resident warp allocator: + __device__ void Init(uint32_t& hash_coef, uint32_t& tid, uint32_t& lane_id) { + // resident in register per thread + // call on before every insertion + hash_coef_ = hash_coef; + num_attempts_ = 0; + // hashing the memory block to be used: + createMemBlockIndex((tid >> 5) + hash_coef_); + + // loading the assigned memory block: + resident_bitmap_ = + *(super_blocks_ + super_block_index_ * SUPER_BLOCK_SIZE_ + + resident_index_ * BITMAP_SIZE_ + lane_id); + allocated_index_ = 0xFFFFFFFF; + } + + __device__ uint32_t WarpAllocate(const uint32_t& lane_id); + /* + __device__ uint32_t WarpAllocate(const uint32_t& lane_id) { + // tries and allocate a new memory units within the resident memory + // block if it returns 0xFFFFFFFF, then 
there was not any empty memory + // unit a new resident block should be chosen, and repeat again + // allocated result: _LOG_NUM_SUPER_BLOCKS bits: super_block_index + // (22 - _LOG_NUM_SUPER_BLOCKS) bits: memory block index + // 5 bits: memory unit index (hi-bits of 10bit) + // 5 bits: memory unit index (lo-bits of 10bit) + int empty_lane = -1; + uint32_t free_lane; + uint32_t read_bitmap = resident_bitmap_; + uint32_t allocated_result = 0xFFFFFFFF; + // works as long as <31 bit are used in the allocated_result + // in other words, if there are 32 super blocks and at most 64k blocks + // per super block + + while (allocated_result == 0xFFFFFFFF) { + empty_lane = __ffs(~resident_bitmap_) - 1; + free_lane = __ballot_sync(0xFFFFFFFF, empty_lane >= 0); + if (free_lane == 0) { + // all bitmaps are full: need to be rehashed again: + updateMemBlockIndex(((threadIdx.x + blockIdx.x * blockDim.x) >> + 5) + hash_coef_); + read_bitmap = resident_bitmap_; + continue; + } + uint32_t src_lane = __ffs(free_lane) - 1; + if (src_lane == lane_id) { + read_bitmap = atomicCAS( + super_blocks_ + super_block_index_ * SUPER_BLOCK_SIZE_ + + resident_index_ * BITMAP_SIZE_ + lane_id, + resident_bitmap_, resident_bitmap_ | (1 << empty_lane)); + if (read_bitmap == resident_bitmap_) { + // successful attempt: + resident_bitmap_ |= (1 << empty_lane); + allocated_result = + (super_block_index_ + << SUPER_BLOCK_BIT_OFFSET_ALLOC_) | + (resident_index_ << MEM_BLOCK_BIT_OFFSET_ALLOC_) | + (lane_id << MEM_UNIT_BIT_OFFSET_ALLOC_) | + empty_lane; + } else { + // Not successful: updating the current bitmap + resident_bitmap_ = read_bitmap; + } + } + // asking for the allocated result; + allocated_result = + __shfl_sync(0xFFFFFFFF, allocated_result, src_lane); + } + return allocated_result; + } + */ + + // This function, frees a recently allocated memory unit by a single thread. + // Since it is untouched, there shouldn't be any worries for the actual + // memory contents to be reset again. 
+ __device__ void FreeUntouched(addr_t ptr) { + atomicAnd(super_blocks_ + getSuperBlockIndex(ptr) * SUPER_BLOCK_SIZE_ + + getMemBlockIndex(ptr) * BITMAP_SIZE_ + + (getMemUnitIndex(ptr) >> 5), + ~(1 << (getMemUnitIndex(ptr) & 0x1F))); + } + +private: + // ========= + // some helper inline address functions: + // ========= + __device__ __host__ __forceinline__ uint32_t + getSuperBlockIndex(addr_t address) const { + return address >> SUPER_BLOCK_BIT_OFFSET_ALLOC_; + } + __device__ __host__ __forceinline__ uint32_t + getMemBlockIndex(addr_t address) const { + return (address >> MEM_BLOCK_BIT_OFFSET_ALLOC_) & MEM_BLOCK_MASK_; + } + __device__ __host__ __forceinline__ addr_t + getMemBlockAddress(addr_t address) const { + return (MEM_BLOCK_OFFSET_ + + getMemBlockIndex(address) * MEM_BLOCK_SIZE_); + } + __device__ __host__ __forceinline__ uint32_t + getMemUnitIndex(addr_t address) const { + return address & MEM_UNIT_MASK_; + } + __device__ __host__ __forceinline__ addr_t + getMemUnitAddress(addr_t address) { + return getMemUnitIndex(address) * MEM_UNIT_SIZE_; + } + + // called at the beginning of the kernel: + __device__ void createMemBlockIndex(uint32_t global_warp_id) { + super_block_index_ = global_warp_id & SUPER_BLOCKS_MASK_; + resident_index_ = + (global_warp_id >> _LOG_NUM_SUPER_BLOCKS) & + MEM_BLOCKS_MASK_; + } + + __device__ void updateMemBlockIndex(uint32_t global_warp_id); + /* + // called when the allocator fails to find an empty unit to allocate: + __device__ void updateMemBlockIndex(uint32_t global_warp_id) { + num_attempts_++; + assert(num_attempts_ < 11); + super_block_index_++; + super_block_index_ = (super_block_index_ == num_super_blocks_) + ? 0 + : super_block_index_; + + resident_index_++; + resident_index_ = (resident_index_ == NUM_MEM_BLOCKS_PER_SUPER_BLOCK_) + ? 
0 + : resident_index_; + + // loading the assigned memory block: + resident_bitmap_ = + *((super_blocks_ + super_block_index_ * SUPER_BLOCK_SIZE_) + + resident_index_ * BITMAP_SIZE_ + (threadIdx.x & 0x1F)); + } + */ + + __host__ __device__ addr_t addressDecoder(addr_t address_ptr_index) { + return getSuperBlockIndex(address_ptr_index) * SUPER_BLOCK_SIZE_ + + getMemBlockAddress(address_ptr_index) + + getMemUnitAddress(address_ptr_index); + } + + __host__ __device__ void print_address(addr_t address_ptr_index) { + printf("Super block Index: %d, Memory block index: %d, Memory unit " + "index: " + "%d\n", + getSuperBlockIndex(address_ptr_index), + getMemBlockIndex(address_ptr_index), + getMemUnitIndex(address_ptr_index)); + } + +private: + // a pointer to each super-block + uint32_t* super_blocks_; + + // hash_coef (register): used as (_LOG_NUM_SUPER_BLOCKS + _LOG_NUM_MEM_BLOCKS bits) for hashing + uint32_t hash_coef_; // a random 32-bit + + // resident_index: (register) + // should indicate what memory block and super block is currently resident + // (_LOG_NUM_MEM_BLOCKS bits + _LOG_NUM_SUPER_BLOCKS bits) + // (memory block + super block) + uint32_t num_attempts_; + uint32_t resident_index_; + uint32_t resident_bitmap_; + uint32_t super_block_index_; + uint32_t allocated_index_; // to be asked via shuffle after +}; + +template +__global__ void CountSlabsPerSuperblockKernel(SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> context, + uint32_t* slabs_per_superblock); + +/* + * This class owns the memory for the allocator on the device + */ +template +class SlabAlloc { +public: + using SlabAllocContextT = SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>; +protected: + // a pointer to each super-block + uint32_t* super_blocks_; + + // the context class is actually copied shallowly into GPU device + SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_alloc_context_; +// SlabAllocContextT slab_alloc_context_; + + std::shared_ptr<_Alloc> allocator_; +// _Alloc* allocator_; + +private: + SlabAlloc() : super_blocks_(nullptr) { + + allocator_ = std::make_shared<_Alloc>(); + // In the light version, we put num_super_blocks super blocks within + // a single array + super_blocks_ = allocator_->template allocate( + slab_alloc_context_.SUPER_BLOCK_SIZE_ * + slab_alloc_context_.num_super_blocks_); + + for (int i = 0; i < slab_alloc_context_.num_super_blocks_; i++) { + // setting bitmaps into zeros: + CHECK_CUDA(cudaMemset( + super_blocks_ + i * slab_alloc_context_.SUPER_BLOCK_SIZE_, + 0x00, + slab_alloc_context_.NUM_MEM_BLOCKS_PER_SUPER_BLOCK_ * + slab_alloc_context_.BITMAP_SIZE_ * + sizeof(uint32_t))); + // setting empty memory units into ones: + CHECK_CUDA(cudaMemset( + super_blocks_ + i * slab_alloc_context_.SUPER_BLOCK_SIZE_ + + (slab_alloc_context_ + .NUM_MEM_BLOCKS_PER_SUPER_BLOCK_ * + slab_alloc_context_.BITMAP_SIZE_), + 0xFF, + slab_alloc_context_.MEM_BLOCK_SIZE_ * + slab_alloc_context_ + .NUM_MEM_BLOCKS_PER_SUPER_BLOCK_ * + sizeof(uint32_t))); + } + + // initializing the slab context: + slab_alloc_context_.Setup(super_blocks_); + } + ~SlabAlloc() { + allocator_->template deallocate(super_blocks_); + std::cout << "~SlabAlloc" << std::endl; + } + // Disable copy and assignment construction + SlabAlloc(const SlabAlloc&); + SlabAlloc& operator=(const SlabAlloc&); + +public: + static SlabAlloc<_Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>* 
getInstance() { + static SlabAlloc<_Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> localInstance = + SlabAlloc<_Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>(); + return &localInstance; + } + +public: + SlabAllocContextT& getContext() { return slab_alloc_context_; } + + std::vector CountSlabsPerSuperblock(); + /* + std::vector CountSlabsPerSuperblock() { + const uint32_t num_super_blocks = slab_alloc_context_.num_super_blocks_; + + auto slabs_per_superblock_buffer = + allocator_->template allocate(num_super_blocks); + thrust::device_vector slabs_per_superblock( + slabs_per_superblock_buffer, + slabs_per_superblock_buffer + num_super_blocks); + thrust::fill(slabs_per_superblock.begin(), slabs_per_superblock.end(), + 0); + + // counting total number of allocated memory units: + int blocksize = 128; + int num_mem_units = + slab_alloc_context_.NUM_MEM_BLOCKS_PER_SUPER_BLOCK_ * 32; + int num_cuda_blocks = (num_mem_units + blocksize - 1) / blocksize; + CountSlabsPerSuperblockKernel<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES><<>>( + slab_alloc_context_, + thrust::raw_pointer_cast(slabs_per_superblock.data())); + + std::vector result(num_super_blocks); + thrust::copy(slabs_per_superblock.begin(), slabs_per_superblock.end(), + result.begin()); + allocator_->template deallocate(slabs_per_superblock_buffer); + return std::move(result); + } + */ +}; + +/* +template +__global__ void CountSlabsPerSuperblockKernel(SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> context, + uint32_t* slabs_per_superblock) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + + int num_bitmaps = context.NUM_MEM_BLOCKS_PER_SUPER_BLOCK_ * 32; + if (tid >= num_bitmaps) { + return; + } + + for (int i = 0; i < context.num_super_blocks_; i++) { + uint32_t read_bitmap = *(context.get_ptr_for_bitmap(i, tid)); + atomicAdd(&slabs_per_superblock[i], __popc(read_bitmap)); + } +} +*/ + +} // namespace slab_hash diff --git a/src/3rdparty/gpu_coords_map/include/slab_hash/slab_hash.cu b/src/3rdparty/gpu_coords_map/include/slab_hash/slab_hash.cu new file mode 100644 index 00000000..1d5ca548 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/slab_hash/slab_hash.cu @@ -0,0 +1,1598 @@ +#include +#include "slab_hash.h" +#include "../coordinate.h" +#include "../cuda_unordered_map.h" + +namespace slab_hash { +/** + * Implementation for the host class + **/ +template +SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::SlabHash( + const uint32_t max_bucket_count, + const uint32_t max_keyvalue_count, + uint32_t device_idx) + : num_buckets_(max_bucket_count), + device_idx_(device_idx), + bucket_list_head_(nullptr) { + int32_t device_count = 0; + CHECK_CUDA(cudaGetDeviceCount(&device_count)); + assert(device_idx_ < device_count); + CHECK_CUDA(cudaSetDevice(device_idx_)); + + // allocate an initialize the allocator: + allocator_ = std::make_shared<_Alloc>(device_idx); + slab_list_allocator_ = SlabAlloc<_Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::getInstance(); + + assert(sizeof(_Value) % sizeof(ptr_t) == 0); + // allocating initial buckets: + bucket_list_head_ = allocator_->template allocate(num_buckets_ + + sizeof(_Value) / sizeof(ptr_t)); + cnt_value_ = reinterpret_cast<_Value*>(bucket_list_head_ + num_buckets_); + CHECK_CUDA( + cudaMemset(bucket_list_head_, 0xFF, sizeof(ptr_t) * 
num_buckets_)); + CHECK_CUDA( + cudaMemset(cnt_value_, 0x00, sizeof(_Value))); + + gpu_context_.Setup(bucket_list_head_, num_buckets_, + cnt_value_, + slab_list_allocator_->getContext()); + + // random coefficients for allocator's hash function + std::mt19937 rng(time(0)); + hash_coef_ = rng(); + + std::cout << "hash_coef_: " << hash_coef_ << std::endl; + + const uint32_t num_threads = num_buckets_ * WARP_WIDTH; + const uint32_t num_blocks = (num_threads + BLOCKSIZE_ - 1) / BLOCKSIZE_; + InitKernel<_Key, _Value, _Hash><<>>(gpu_context_, num_threads, hash_coef_); +} + +template +SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::~SlabHash() { + CHECK_CUDA(cudaSetDevice(device_idx_)); + + slab_list_allocator_->getContext() = gpu_context_.get_slab_alloc_ctx(); + auto slabs_per_super_block = slab_list_allocator_->CountSlabsPerSuperblock(); + int total_slabs_stored = std::accumulate( + slabs_per_super_block.begin(), slabs_per_super_block.end(), 0); + + std::cout << "Before total_slabs_stored: " << total_slabs_stored << std::endl; + + for (auto n : slabs_per_super_block) std::cout << n << '\t'; std::cout << std::endl; + + auto elems_per_bucket = CountElemsPerBucket(); + int total_elems_stored = std::accumulate(elems_per_bucket.begin(), + elems_per_bucket.end(), 0); + + printf("Before total_elems_stored: %d\n", total_elems_stored); + + const uint32_t num_threads = num_buckets_ * 32; + const uint32_t num_blocks = (num_threads + BLOCKSIZE_ - 1) / BLOCKSIZE_; + ReleaseKernel<_Key, _Value, _Hash><<>>(gpu_context_, num_threads); +// CHECK_CUDA(cudaDeviceSynchronize()); +// CHECK_CUDA(cudaGetLastError()); + + elems_per_bucket = CountElemsPerBucket(); + total_elems_stored = std::accumulate(elems_per_bucket.begin(), + elems_per_bucket.end(), 0); + + printf("After total_elems_stored: %d\n", total_elems_stored); + + slabs_per_super_block = slab_list_allocator_->CountSlabsPerSuperblock(); + total_slabs_stored = std::accumulate( + slabs_per_super_block.begin(), slabs_per_super_block.end(), 0); + + std::cout << "After total_slabs_stored: " << total_slabs_stored << std::endl; + + for (auto n : slabs_per_super_block) { + std::cout << n << '\t'; +// assert(n == 0); + } + std::cout << std::endl; + + allocator_->template deallocate(bucket_list_head_); + std::cout << num_buckets_ << std::endl; +} + +template +_Value SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::Size() { + + _Value cnt_value; + + CHECK_CUDA(cudaMemcpy(&cnt_value, cnt_value_, + sizeof(_Value), + cudaMemcpyDeviceToHost)); + + return cnt_value; +} + +template +_Value* SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::SizePtr() { + return cnt_value_; +} + +template +void SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::InsertAtomic(_Key* keys, + uint32_t num_keys) { + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; + // calling the kernel for bulk build: + CHECK_CUDA(cudaSetDevice(device_idx_)); + InsertAtomicKernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>(gpu_context_, keys, num_keys, hash_coef_); +// CHECK_CUDA(cudaDeviceSynchronize()); +// CHECK_CUDA(cudaGetLastError()); +} + +template +void SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + 
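// Note on cnt_value_ above: the bucket head array is over-allocated by
// sizeof(_Value) / sizeof(ptr_t) extra slots and cnt_value_ simply aliases that tail.
// Every successful insertion performs atomicAdd(cnt_value_, 1), so the values stored in
// the table double as dense 0-based indices of the unique keys, and Size() is nothing
// more than a single device-to-host copy of that counter.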
_MEM_UNIT_WARP_MULTIPLES>::Search(_Key* keys, + _Value* values, + uint8_t* founds, + uint32_t num_keys) { + CHECK_CUDA(cudaSetDevice(device_idx_)); + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; + SearchKernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>( + gpu_context_, keys, values, founds, num_keys); +// CHECK_CUDA(cudaDeviceSynchronize()); +// CHECK_CUDA(cudaGetLastError()); +} + +template +void SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::Remove(_Key* keys, + uint32_t num_keys) { + std::cout << "Enter Remove" << std::endl; + CHECK_CUDA(cudaSetDevice(device_idx_)); + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; + RemoveKernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>(gpu_context_, keys, num_keys); +// CHECK_CUDA(cudaDeviceSynchronize()); +// CHECK_CUDA(cudaGetLastError()); +} + +template +void SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::InsertAtomic_( + _Key* keys, + _Iterator<_Key, _Value>* iterators, + uint8_t* masks, + uint32_t num_keys) { + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; + // calling the kernel for bulk build: + CHECK_CUDA(cudaSetDevice(device_idx_)); + InsertAtomic_Kernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>( + gpu_context_, keys, iterators, masks, num_keys, hash_coef_); +// CHECK_CUDA(cudaDeviceSynchronize()); +// CHECK_CUDA(cudaGetLastError()); +} + +template +void SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::Search_(_Key* keys, + _Iterator<_Key, _Value>* iterators, + uint8_t* masks, + uint32_t num_keys) { + CHECK_CUDA(cudaSetDevice(device_idx_)); + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; + Search_Kernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>( + gpu_context_, keys, iterators, masks, num_keys); +// CHECK_CUDA(cudaDeviceSynchronize()); +// CHECK_CUDA(cudaGetLastError()); +} + +template +void SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::Remove_(_Key* keys, + uint8_t* masks, + uint32_t num_keys) { + CHECK_CUDA(cudaSetDevice(device_idx_)); + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; + Remove_Kernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>(gpu_context_, keys, masks, num_keys); +// CHECK_CUDA(cudaDeviceSynchronize()); +// CHECK_CUDA(cudaGetLastError()); +} + +/* Debug usage */ +template +std::vector SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::CountElemsPerBucket() { + std::cout << "num_buckets_: " << num_buckets_ << std::endl; + + auto elems_per_bucket_buffer = + allocator_->template allocate(num_buckets_); + + thrust::device_vector elems_per_bucket( + elems_per_bucket_buffer, elems_per_bucket_buffer + num_buckets_); + thrust::fill(elems_per_bucket.begin(), elems_per_bucket.end(), 0); + + const uint32_t blocksize = 128; + const uint32_t num_blocks = (num_buckets_ * 32 + blocksize - 1) / blocksize; + CountElemsPerBucketKernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + 
_LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>( + gpu_context_, thrust::raw_pointer_cast(elems_per_bucket.data()), hash_coef_); + + std::vector result(num_buckets_); + thrust::copy(elems_per_bucket.begin(), elems_per_bucket.end(), + result.begin()); + allocator_->template deallocate(elems_per_bucket_buffer); + return std::move(result); +} + +/* Debug usage */ +template +void SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::CountElems(int* count) { + + _Value cnt_value = Size(); + + auto values_buffer = + allocator_->template allocate(cnt_value); + + auto index_buffer = + allocator_->template allocate(1); + + thrust::device_vector values( + values_buffer, values_buffer + cnt_value); + + thrust::device_vector index( + index_buffer, index_buffer + 1); + thrust::fill(index.begin(), index.end(), 0); + + const uint32_t blocksize = 128; + const uint32_t num_blocks = (num_buckets_ * 32 + blocksize - 1) / blocksize; + std::cout << "Before CountElemsKernel" << std::endl; + CountElemsKernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>( + gpu_context_, + thrust::raw_pointer_cast(values.data()), + thrust::raw_pointer_cast(index.data()), + count + ); + std::cout << "After CountElemsKernel" << std::endl; + + std::vector sorted_values(cnt_value); + std::vector cnt(1); + thrust::copy(values.begin(), values.end(), + sorted_values.begin()); + thrust::copy(index.begin(), index.end(), + cnt.begin()); + allocator_->template deallocate(values_buffer); + allocator_->template deallocate(index_buffer); + + std::cout << "Total Values: " << cnt[0] << std::endl; + std::sort(sorted_values.begin(), sorted_values.begin() + cnt[0]); + for (int i = 0; i != cnt[0]; ++i) { + if (i != sorted_values[i]) std::cout << i << '\t' << sorted_values[i] << std::endl; + assert(i == sorted_values[i]); + } +} + +template +double SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::ComputeLoadFactor() { + auto elems_per_bucket = CountElemsPerBucket(); + int total_elems_stored = std::accumulate(elems_per_bucket.begin(), + elems_per_bucket.end(), 0); + + slab_list_allocator_->getContext() = gpu_context_.get_slab_alloc_ctx(); + auto slabs_per_super_block = slab_list_allocator_->CountSlabsPerSuperblock(); + int total_slabs_stored = std::accumulate( + slabs_per_super_block.begin(), slabs_per_super_block.end(), num_buckets_); + + double load_factor = double(total_elems_stored) / + double(total_slabs_stored * WARP_WIDTH); + + return load_factor; +} + +//////////////// +template +void +SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>:: +BulkInsertWithMapping(const int* p_coords, + int* p_mapping, + int* p_inverse_mapping, + int num_keys) { + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; + // calling the kernel for bulk build: + CHECK_CUDA(cudaSetDevice(device_idx_)); + BulkInsertWithMappingKernel<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + <<>>(gpu_context_, p_coords, + p_mapping, p_inverse_mapping, + num_keys, hash_coef_); +// CHECK_CUDA(cudaDeviceSynchronize()); +// CHECK_CUDA(cudaGetLastError()); +} + +template +void +SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>:: +IterateKeys(int* p_coords, int size) { +} + +template +void +SlabHash<_Key, 
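// Rough intuition for ComputeLoadFactor() above: total_slabs_stored is the allocator's
// per-super-block slab count summed up plus num_buckets_, and every slab is treated as
// WARP_WIDTH (32) slots, so
//     load_factor = total_elems_stored / (total_slabs_stored * 32).
// For example, 1,000,000 stored elements over a combined count of 50,000 slabs gives
// 1,000,000 / (50,000 * 32) = 0.625; a much smaller value means the allocated slabs are
// mostly empty for the chosen number of buckets.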
_Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>:: +IterateSearchAtBatch(int* p_out, int batch_index, int size) {} + +template +void +SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>:: +IterateSearchPerBatch(const std::vector& p_outs, int size) {} + +template +void +SlabHash<_Key, _Value, _Hash, _Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>:: +IterateOffsetInsert(const std::shared_ptr>& in_map, + int* p_offset, int size) {} +//////////////// + +/** + * Definitions + **/ +template +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::SlabHashContext() + : num_buckets_(0), bucket_list_head_(nullptr) { +} + +template +__global__ void InitKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + const uint32_t num_threads, + uint32_t hash_coef) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= num_threads) return; + uint32_t lane_id = tid & 0x1F; + uint32_t bucket_id = (tid >> 5); + + + slab_hash_ctx.get_slab_alloc_ctx().Init(hash_coef, tid, lane_id); + + ptr_t new_next_slab_ptr = slab_hash_ctx.get_slab_alloc_ctx().WarpAllocate(lane_id); + + if (lane_id == NEXT_SLAB_PTR_LANE) { + + const uint32_t* unit_data_ptr = + slab_hash_ctx.get_slab_ptr_from_list_head(bucket_id); + +// TODO(ljm): Ideal should be OK. + //Ideal: +// *((unsigned int*)unit_data_ptr) = new_next_slab_ptr; + + ptr_t old_next_slab_ptr = + atomicCAS((unsigned int*)unit_data_ptr, + EMPTY_SLAB_PTR, new_next_slab_ptr); + + if (old_next_slab_ptr != EMPTY_SLAB_PTR) { + slab_hash_ctx.get_slab_alloc_ctx().FreeUntouched(new_next_slab_ptr); + } + } + +} + +template +__global__ void ReleaseKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + const uint32_t num_threads) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= num_threads) return; + uint32_t lane_id = (tid & 0x1F); + uint32_t bucket_id = (tid >> 5); + + ptr_t* curr_slab_ptr_ptr = NULL; + ptr_t curr_slab_ptr = EMPTY_SLAB_PTR; + if (lane_id == NEXT_SLAB_PTR_LANE) { + curr_slab_ptr_ptr = + slab_hash_ctx.get_slab_ptr_from_list_head(bucket_id); + curr_slab_ptr = *(curr_slab_ptr_ptr); + } + curr_slab_ptr = + __shfl_sync(ACTIVE_LANES_MASK, curr_slab_ptr, + NEXT_SLAB_PTR_LANE, WARP_WIDTH); + while (curr_slab_ptr != EMPTY_SLAB_PTR) { + + ptr_t* unit_data_ptr = + slab_hash_ctx.get_unit_ptr_from_list_nodes(curr_slab_ptr, lane_id); + + if (lane_id != NEXT_SLAB_PTR_LANE) { + ptr_t old_first_key = atomicExch(unit_data_ptr, EMPTY_PAIR_PTR); + if (old_first_key != EMPTY_PAIR_PTR) { + slab_hash_ctx.ClearRemainPair(unit_data_ptr); + } + } else { + // set empty first + *(curr_slab_ptr_ptr) = EMPTY_SLAB_PTR; + // no need atomicExch ideally + // atomicExch(curr_slab_ptr_ptr, EMPTY_PAIR_PTR); + // then, free untouched + slab_hash_ctx.get_slab_alloc_ctx().FreeUntouched(curr_slab_ptr); + curr_slab_ptr_ptr = unit_data_ptr; + curr_slab_ptr = *(curr_slab_ptr_ptr); + } + curr_slab_ptr = + __shfl_sync(ACTIVE_LANES_MASK, curr_slab_ptr, + NEXT_SLAB_PTR_LANE, WARP_WIDTH); + } +} + +template +__device__ __host__ __forceinline__ uint32_t +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::ComputeBucket( + const _Key& key) const { + 
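// Layout reminder for the kernels above: bucket_list_head_ holds one ptr_t per bucket;
// InitKernel allocates the first slab of every bucket up front and CASes it into that
// head slot, and within each slab lane NEXT_SLAB_PTR_LANE (31) stores the pointer to
// the next slab, so a bucket is a singly linked list of 32-lane slabs. ReleaseKernel
// walks the same list, clearing the stored pairs and returning every slab to the
// allocator via FreeUntouched.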
return hash_fn_(key) % num_buckets_; +} + +template +__device__ __forceinline__ void +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::WarpSyncKey( + const _Key& key, + const uint32_t lane_id, + _Key& ret) { +#pragma unroll 1 + for (size_t i = 0; i != key_chunks; ++i) { + ((int*)(&ret))[i] = __shfl_sync(ACTIVE_LANES_MASK, ((int*)(&key))[i], + lane_id, WARP_WIDTH); + } +} + +template +__device__ __forceinline__ void +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::ClearRemainPair(ptr_t* ptr) { +#pragma unroll 1 + for (size_t i = 1; + i != key_chunks + value_chunks; + ++i) { + ptr[i] = EMPTY_SLAB_PTR; + } +} + +template +__device__ __forceinline__ void +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::CopyRemainPair( + ptr_t* ptr, + const _Key& key, + const _Value& value) { +#pragma unroll 1 + for (size_t i = 1; i < key_chunks; ++i) { + ((int*)(ptr))[i] = ((int*)(&key))[i]; + } +#pragma unroll 1 + for (size_t i = 0; i < value_chunks; ++i) { + ((int*)(ptr))[key_chunks + i] = ((int*)(&value))[i]; + } +} + +template +__device__ int32_t +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::WarpFindKey( + const _Key& key, const uint32_t lane_id, const ptr_t* ptr) { + uint8_t is_lane_found = + /* select key lanes */ + ((1 << lane_id) & PAIR_PTR_LANES_MASK) + && (reinterpret_cast*>(ptr))->first == key; + + return __ffs(__ballot_sync(PAIR_PTR_LANES_MASK, is_lane_found)) - 1; +} + +template +__device__ __forceinline__ int32_t +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::WarpFindEmpty(const ptr_t* ptr) { + //assert(value_chunks == 1); + uint8_t is_lane_empty = (reinterpret_cast(ptr)[_MEM_UNIT_WARP_MULTIPLES - 1] == EMPTY_PAIR_PTR); + return __ffs(__ballot_sync(PAIR_PTR_LANES_MASK, is_lane_empty)) - 1; +} + +template +__device__ __forceinline__ ptr_t +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::AllocateSlab( + const uint32_t lane_id) { + return slab_list_allocator_ctx_.WarpAllocate(lane_id); +} + +template +__device__ __forceinline__ void +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::FreeSlab( + const ptr_t slab_ptr) { + slab_list_allocator_ctx_.FreeUntouched(slab_ptr); +} + +template +__device__ _Pair +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::Search( + uint8_t& to_search, + const uint32_t lane_id, + const uint32_t bucket_id, + const _Key& query_key) { + uint32_t work_queue = 0; + uint32_t prev_work_queue = work_queue; + uint32_t curr_slab_ptr = EMPTY_SLAB_PTR; + + ptr_t* iterator = NULL; + uint8_t mask = false; + + /** > Loop when we have active lanes **/ + while ((work_queue = __ballot_sync(ACTIVE_LANES_MASK, to_search))) { + /** 0. Restart from linked list head if the last query is finished + * **/ + uint32_t src_lane = __ffs(work_queue) - 1; + uint32_t src_bucket = + __shfl_sync(ACTIVE_LANES_MASK, bucket_id, src_lane, WARP_WIDTH); + + _Key src_key; + WarpSyncKey(query_key, src_lane, src_key); + + curr_slab_ptr = + (prev_work_queue != work_queue) + ? 
*(get_slab_ptr_from_list_head(src_bucket)) + : curr_slab_ptr; + + /* Each lane in the warp reads a uint in the slab in parallel */ + const uint32_t* unit_data_ptr = + get_unit_ptr_from_list_nodes(curr_slab_ptr, lane_id); + + int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data_ptr); + + /** 1. Found in this slab, SUCCEED **/ + if (lane_found >= 0) { + /* broadcast found value */ + uint64_t found_pair_internal_ptr = __shfl_sync( + ACTIVE_LANES_MASK, reinterpret_cast(unit_data_ptr), lane_found, WARP_WIDTH); + + if (lane_id == src_lane) { + to_search = false; + + iterator = reinterpret_cast(found_pair_internal_ptr); + mask = true; + } + } + + /** 2. Not found in this slab **/ + else { + ptr_t unit_data = *(reinterpret_cast(unit_data_ptr)); + /* broadcast next slab: lane 31 reads 'next' */ + ptr_t next_slab_ptr = __shfl_sync(ACTIVE_LANES_MASK, unit_data, + NEXT_SLAB_PTR_LANE, WARP_WIDTH); + + /** 2.1. Next slab is empty, ABORT **/ + if (next_slab_ptr == EMPTY_SLAB_PTR) { + if (lane_id == src_lane) { + to_search = false; + } + } + /** 2.2. Next slab exists, RESTART **/ + else { + curr_slab_ptr = next_slab_ptr; + } + } + + prev_work_queue = work_queue; + } + + return _make_pair(iterator, mask); +} + +/* + * Insert: ABORT if found + * replacePair: REPLACE if found + * WE DO NOT ALLOW DUPLICATE KEYS + */ +template +__device__ _Pair +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::InsertAtomic( + uint8_t& to_be_inserted, + const uint32_t lane_id, + const uint32_t bucket_id, + const _Key& key) { + uint32_t work_queue = 0; + uint32_t prev_work_queue = 0; + uint32_t curr_slab_ptr = EMPTY_SLAB_PTR; + + ptr_t* iterator = NULL; + uint8_t mask = false; + + const uint32_t first_key = *reinterpret_cast(&key); + + /** > Loop when we have active lanes **/ + while ((work_queue = __ballot_sync(ACTIVE_LANES_MASK, to_be_inserted))) { + /** 0. Restart from linked list head if last insertion is finished + * **/ + uint32_t src_lane = __ffs(work_queue) - 1; + uint32_t src_bucket = + __shfl_sync(ACTIVE_LANES_MASK, bucket_id, src_lane, WARP_WIDTH); + + curr_slab_ptr = + (prev_work_queue != work_queue) + ? 
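// Search above and InsertAtomic / Remove below share the same warp-cooperative
// work-queue structure; a condensed, self-contained skeleton of that loop (with the
// slab traversal details elided) looks roughly like this:
__device__ void work_queue_skeleton(uint8_t& lane_active, uint32_t lane_id) {
    uint32_t work_queue = 0;
    uint32_t prev_work_queue = 0;
    while ((work_queue = __ballot_sync(ACTIVE_LANES_MASK, lane_active))) {
        // The lowest-numbered lane that still has work becomes the source lane; the
        // whole warp then cooperates on that lane's key (broadcast via __shfl_sync).
        uint32_t src_lane = __ffs(work_queue) - 1;

        // prev_work_queue != work_queue signals that the warp switched to a new query,
        // so the traversal restarts from the bucket's list head.
        bool restart_from_head = (prev_work_queue != work_queue);
        (void)restart_from_head;

        // ... each lane reads one uint of the current slab; WarpFindKey / WarpFindEmpty
        // reduce the 32 per-lane comparisons to a single lane index ...

        if (lane_id == src_lane) {
            lane_active = false;   // the source lane retires once its query is resolved
        }
        prev_work_queue = work_queue;
    }
}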
*(get_slab_ptr_from_list_head(src_bucket)) + : curr_slab_ptr; + + /* Each lane in the warp reads a uint in the slab */ + uint32_t* unit_data_ptr = + get_unit_ptr_from_list_nodes(curr_slab_ptr, lane_id); + + int32_t lane_empty = WarpFindEmpty(unit_data_ptr); + _Key src_key; + WarpSyncKey(key, src_lane, src_key); + int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data_ptr); + + /** Branch 1: key already existing, ABORT **/ + if (lane_found >= 0) { + if (lane_id == src_lane) { + /* free memory heap */ + to_be_inserted = false; + } + } + + /** Branch 2: empty slot available, try to insert **/ + else if (lane_empty >= 0) { + uint64_t lane_empty_data_ptr = __shfl_sync(ACTIVE_LANES_MASK, + reinterpret_cast(unit_data_ptr), lane_empty, WARP_WIDTH); + unit_data_ptr = reinterpret_cast(lane_empty_data_ptr); + if (lane_id == src_lane) { + + uint32_t old_first_data = + atomicCAS(unit_data_ptr, EMPTY_PAIR_PTR, first_key); + + /** Branch 2.1: SUCCEED **/ + if (old_first_data == EMPTY_PAIR_PTR) { + // copy the remaining data + _Value value = atomicAdd(cnt_value_, 1); + + CopyRemainPair(unit_data_ptr, key, value); + to_be_inserted = false; + + iterator = unit_data_ptr; + mask = true; + } + + /** Branch 2.2: failed: RESTART + * In the consequent attempt, + * > if the same key was inserted in this slot, + * we fall back to Branch 1; + * > if a different key was inserted, + * we go to Branch 2 or 3. + * **/ + } + } + + /** Branch 3: nothing found in this slab, goto next slab **/ + else { + /* broadcast next slab */ + ptr_t next_slab_ptr = __shfl_sync(ACTIVE_LANES_MASK, *reinterpret_cast(unit_data_ptr), + NEXT_SLAB_PTR_LANE, WARP_WIDTH); + + /** Branch 3.1: next slab existing, RESTART this lane **/ + if (next_slab_ptr != EMPTY_SLAB_PTR) { + curr_slab_ptr = next_slab_ptr; + } + + /** Branch 3.2: next slab empty, try to allocate one **/ + else { + ptr_t new_next_slab_ptr = AllocateSlab(lane_id); + + if (lane_id == NEXT_SLAB_PTR_LANE) { + const uint32_t* unit_data_ptr = + get_unit_ptr_from_list_nodes( + curr_slab_ptr, + NEXT_SLAB_PTR_LANE); + + ptr_t old_next_slab_ptr = + atomicCAS((unsigned int*)unit_data_ptr, + EMPTY_SLAB_PTR, new_next_slab_ptr); + + /** Branch 3.2.1: other thread allocated, RESTART lane + * In the consequent attempt, goto Branch 2' **/ + if (old_next_slab_ptr != EMPTY_SLAB_PTR) { + FreeSlab(new_next_slab_ptr); + } + /** Branch 3.2.2: this thread allocated, RESTART lane, + * 'goto Branch 2' **/ + } + } + } + + prev_work_queue = work_queue; + } + + return _make_pair(iterator, mask); +} + +template +__device__ uint8_t +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::Remove( + uint8_t& to_be_deleted, + const uint32_t lane_id, + const uint32_t bucket_id, + const _Key& key) { + uint32_t work_queue = 0; + uint32_t prev_work_queue = 0; + uint32_t curr_slab_ptr = EMPTY_SLAB_PTR; + + uint8_t mask = false; + + /** > Loop when we have active lanes **/ + while ((work_queue = __ballot_sync(ACTIVE_LANES_MASK, to_be_deleted))) { + /** 0. Restart from linked list head if last insertion is finished + * **/ + uint32_t src_lane = __ffs(work_queue) - 1; + uint32_t src_bucket = + __shfl_sync(ACTIVE_LANES_MASK, bucket_id, src_lane, WARP_WIDTH); + + _Key src_key; + WarpSyncKey(key, src_lane, src_key); + + curr_slab_ptr = + (prev_work_queue != work_queue) + ? 
*(get_slab_ptr_from_list_head(src_bucket)) + : curr_slab_ptr; + + const uint32_t* unit_data_ptr = + get_unit_ptr_from_list_nodes(curr_slab_ptr, lane_id); + + int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data_ptr); + + /** Branch 1: key found **/ + if (lane_found >= 0) { + + if (lane_id == src_lane) { + uint32_t* unit_data_ptr = + get_unit_ptr_from_list_nodes(curr_slab_ptr, + lane_found); + ptr_t pair_to_delete = *reinterpret_cast(&src_key); + + // TODO: keep in mind the potential double free problem + ptr_t old_key_value_pair = + atomicCAS((unsigned int*)(unit_data_ptr), + pair_to_delete, EMPTY_PAIR_PTR); + + /** Branch 1.1: this thread reset, free src_addr **/ + if (old_key_value_pair == pair_to_delete) { + ClearRemainPair(unit_data_ptr); + mask = true; + } + /** Branch 1.2: other thread did the job, avoid double free + * **/ + to_be_deleted = false; + } + } else { // no matching slot found: + ptr_t unit_data = *(reinterpret_cast(unit_data_ptr)); + ptr_t next_slab_ptr = __shfl_sync(ACTIVE_LANES_MASK, unit_data, + NEXT_SLAB_PTR_LANE, WARP_WIDTH); + if (next_slab_ptr == EMPTY_SLAB_PTR) { + // not found: + if (lane_id == src_lane) { + to_be_deleted = false; + } + } else { + curr_slab_ptr = next_slab_ptr; + } + } + prev_work_queue = work_queue; + } + + return mask; +} + +///////////////// +template +__device__ void +SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::BulkInsertWithMapping( + uint8_t& to_be_inserted, + const uint32_t lane_id, + const uint32_t bucket_id, + const _Key& key, + int* p_mapping, + int* p_inverse_mapping, + int key_idx) { + uint32_t work_queue = 0; + uint32_t prev_work_queue = 0; + uint32_t curr_slab_ptr = EMPTY_SLAB_PTR; + + //ptr_t* iterator = NULL; + //uint8_t mask = false; + + const uint32_t first_key = *reinterpret_cast(&key); + + /** > Loop when we have active lanes **/ + while ((work_queue = __ballot_sync(ACTIVE_LANES_MASK, to_be_inserted))) { + /** 0. Restart from linked list head if last insertion is finished + * **/ + uint32_t src_lane = __ffs(work_queue) - 1; + uint32_t src_bucket = + __shfl_sync(ACTIVE_LANES_MASK, bucket_id, src_lane, WARP_WIDTH); + + curr_slab_ptr = + (prev_work_queue != work_queue) + ? 
*(get_slab_ptr_from_list_head(src_bucket)) + : curr_slab_ptr; + + /* Each lane in the warp reads a uint in the slab */ + uint32_t* unit_data_ptr = + get_unit_ptr_from_list_nodes(curr_slab_ptr, lane_id); + + int32_t lane_empty = WarpFindEmpty(unit_data_ptr); + _Key src_key; + WarpSyncKey(key, src_lane, src_key); + int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data_ptr); + + /** Branch 1: key already existing, ABORT **/ + if (lane_found >= 0) { + uint64_t found_pair_internal_ptr = __shfl_sync( + ACTIVE_LANES_MASK, reinterpret_cast(unit_data_ptr), lane_found, WARP_WIDTH); + + if (lane_id == src_lane) { + /// + p_inverse_mapping[key_idx] = static_cast( + *reinterpret_cast<_Value*>( + reinterpret_cast( + found_pair_internal_ptr) + + key_chunks) + ); + /// + to_be_inserted = false; + } + } + + /** Branch 2: empty slot available, try to insert **/ + else if (lane_empty >= 0) { + uint64_t lane_empty_data_ptr = __shfl_sync(ACTIVE_LANES_MASK, + reinterpret_cast(unit_data_ptr), lane_empty, WARP_WIDTH); + unit_data_ptr = reinterpret_cast(lane_empty_data_ptr); + if (lane_id == src_lane) { + + uint32_t old_first_data = + atomicCAS(unit_data_ptr, EMPTY_PAIR_PTR, first_key); + + /** Branch 2.1: SUCCEED **/ + if (old_first_data == EMPTY_PAIR_PTR) { + // copy the remaining data + _Value value = atomicAdd(cnt_value_, 1); + + CopyRemainPair(unit_data_ptr, key, value); + /// + p_mapping[value] = key_idx; + p_inverse_mapping[key_idx] = value; + /// + to_be_inserted = false; + + //iterator = unit_data_ptr; + //mask = true; + } + + /** Branch 2.2: failed: RESTART + * In the consequent attempt, + * > if the same key was inserted in this slot, + * we fall back to Branch 1; + * > if a different key was inserted, + * we go to Branch 2 or 3. + * **/ + } + } + + /** Branch 3: nothing found in this slab, goto next slab **/ + else { + /* broadcast next slab */ + ptr_t next_slab_ptr = __shfl_sync(ACTIVE_LANES_MASK, *reinterpret_cast(unit_data_ptr), + NEXT_SLAB_PTR_LANE, WARP_WIDTH); + + /** Branch 3.1: next slab existing, RESTART this lane **/ + if (next_slab_ptr != EMPTY_SLAB_PTR) { + curr_slab_ptr = next_slab_ptr; + } + + /** Branch 3.2: next slab empty, try to allocate one **/ + else { + ptr_t new_next_slab_ptr = AllocateSlab(lane_id); + + if (lane_id == NEXT_SLAB_PTR_LANE) { + const uint32_t* unit_data_ptr = + get_unit_ptr_from_list_nodes( + curr_slab_ptr, + NEXT_SLAB_PTR_LANE); + + ptr_t old_next_slab_ptr = + atomicCAS((unsigned int*)unit_data_ptr, + EMPTY_SLAB_PTR, new_next_slab_ptr); + + /** Branch 3.2.1: other thread allocated, RESTART lane + * In the consequent attempt, goto Branch 2' **/ + if (old_next_slab_ptr != EMPTY_SLAB_PTR) { + FreeSlab(new_next_slab_ptr); + } + /** Branch 3.2.2: this thread allocated, RESTART lane, + * 'goto Branch 2' **/ + } + } + } + + prev_work_queue = work_queue; + } + +// return _make_pair(iterator, mask); +} + +///////////////// + +template +__global__ void +SearchKernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* keys, + _Value* values, + uint8_t* founds, + uint32_t num_queries) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + /* This warp is idle */ + if ((tid - lane_id) >= num_queries) { + return; + } + + uint8_t lane_active = false; + uint32_t bucket_id = 0; + _Key key; + + if (tid < num_queries) { + lane_active = true; + key = keys[tid]; + bucket_id = slab_hash_ctx.ComputeBucket(key); + } + + _Pair result = + 
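// What BulkInsertWithMapping above produces, in terms of the dense values handed out by
// atomicAdd(cnt_value_, 1): for every input row i, p_inverse_mapping[i] receives the
// dense index of the unique coordinate that row maps to (Branch 1 for duplicates,
// Branch 2.1 for first occurrences), and p_mapping[v] records which input row first
// claimed unique index v. A small illustration, assuming three input coordinates where
// the last duplicates the first:
//   rows:              0 -> (1,2,3)   1 -> (4,5,6)   2 -> (1,2,3)
//   possible result:   p_inverse_mapping = {0, 1, 0},   p_mapping = {0, 1}
// (the concrete dense indices depend on the order in which the atomic counter is hit).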
slab_hash_ctx.Search(lane_active, lane_id, bucket_id, key); + + if (tid < num_queries) { + uint8_t found = result.second; + founds[tid] = found; + values[tid] = found ? reinterpret_cast<_Pair<_Key, _Value>*>(result.first) + ->second + : _Value(0); + + } +} + +template +__global__ void +InsertAtomicKernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + slab_hash_ctx, + _Key* keys, + uint32_t num_keys, + uint32_t hash_coef) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + if ((tid - lane_id) >= num_keys) { + return; + } + + slab_hash_ctx.get_slab_alloc_ctx().Init(hash_coef, tid, lane_id); + + uint8_t lane_active = false; + uint32_t bucket_id = 0; + _Key key; + + if (tid < num_keys) { + lane_active = true; + key = keys[tid]; + bucket_id = slab_hash_ctx.ComputeBucket(key); + } + + slab_hash_ctx.InsertAtomic(lane_active, lane_id, bucket_id, key); +} + +template +__global__ void +RemoveKernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* keys, + uint32_t num_keys) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + if ((tid - lane_id) >= num_keys) { + return; + } + + uint8_t lane_active = false; + uint32_t bucket_id = 0; + _Key key; + + if (tid < num_keys) { + lane_active = true; + key = keys[tid]; + bucket_id = slab_hash_ctx.ComputeBucket(key); + } + + slab_hash_ctx.Remove(lane_active, lane_id, bucket_id, key); +} + +template +__global__ void +Search_Kernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* keys, + _Iterator<_Key, _Value>* iterators, + uint8_t* masks, + uint32_t num_queries) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + /* This warp is idle */ + if ((tid - lane_id) >= num_queries) { + return; + } + + uint8_t lane_active = false; + uint32_t bucket_id = 0; + _Key key; + + if (tid < num_queries) { + lane_active = true; + key = keys[tid]; + bucket_id = slab_hash_ctx.ComputeBucket(key); + } + + _Pair result = + slab_hash_ctx.Search(lane_active, lane_id, bucket_id, key); + + if (tid < num_queries) { + iterators[tid] = reinterpret_cast<_Pair<_Key, _Value>*>(result.first); + masks[tid] = result.second; + } +} + +template +__global__ void +InsertAtomic_Kernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + slab_hash_ctx, + _Key* keys, + _Iterator<_Key, _Value>* iterators, + uint8_t* masks, + uint32_t num_keys, + uint32_t hash_coef) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + if ((tid - lane_id) >= num_keys) { + return; + } + + slab_hash_ctx.get_slab_alloc_ctx().Init(hash_coef, tid, lane_id); + + uint8_t lane_active = false; + uint32_t bucket_id = 0; + _Key key; + + if (tid < num_keys) { + lane_active = true; + key = keys[tid]; + bucket_id = slab_hash_ctx.ComputeBucket(key); + } + + _Pair result = + slab_hash_ctx.InsertAtomic(lane_active, lane_id, bucket_id, key); + + if (tid < num_keys) { + iterators[tid] = reinterpret_cast<_Pair<_Key, _Value>*>(result.first); + masks[tid] = result.second; + } +} + +template +__global__ void +Remove_Kernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + 
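// All of the kernels above use the same thread-to-key mapping: one thread per key,
// launched as (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_ blocks of BLOCKSIZE_ = 128
// threads (e.g. 1000 keys -> 8 blocks of 128). A thread with tid >= num_keys returns
// early only if its whole warp is out of range — the guard is
// (tid - lane_id) >= num_keys, i.e. "lane 0 of my warp is already past the end" —
// because inactive lanes must still participate in the warp-wide __ballot_sync and
// __shfl_sync calls; they simply carry lane_active == false.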
_Key* keys, + uint8_t* masks, + uint32_t num_keys) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + if ((tid - lane_id) >= num_keys) { + return; + } + + uint8_t lane_active = false; + uint32_t bucket_id = 0; + _Key key; + + if (tid < num_keys) { + lane_active = true; + key = keys[tid]; + bucket_id = slab_hash_ctx.ComputeBucket(key); + } + + uint8_t success = + slab_hash_ctx.Remove(lane_active, lane_id, bucket_id, key); + + if (tid < num_keys) { + masks[tid] = success; + } +} + +template +__global__ void GetIteratorsKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + ptr_t* iterators, + uint32_t* iterator_count, + uint32_t num_buckets) { + // global warp ID + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t wid = tid >> 5; + // assigning a warp per bucket + if (wid >= num_buckets) { + return; + } + + /* uint32_t lane_id = threadIdx.x & 0x1F; */ + + /* // initializing the memory allocator on each warp: */ + /* slab_hash_ctx.get_slab_alloc_ctx().Init(tid, lane_id); */ + + /* uint32_t src_unit_data = */ + /* *slab_hash_ctx.get_unit_ptr_from_list_head(wid, lane_id); */ + /* uint32_t active_mask = */ + /* __ballot_sync(PAIR_PTR_LANES_MASK, src_unit_data != + * EMPTY_PAIR_PTR); */ + /* int leader = __ffs(active_mask) - 1; */ + /* uint32_t count = __popc(active_mask); */ + /* uint32_t rank = __popc(active_mask & __lanemask_lt()); */ + /* uint32_t prev_count; */ + /* if (rank == 0) { */ + /* prev_count = atomicAdd(iterator_count, count); */ + /* } */ + /* prev_count = __shfl_sync(active_mask, prev_count, leader); */ + + /* if (src_unit_data != EMPTY_PAIR_PTR) { */ + /* iterators[prev_count + rank] = src_unit_data; */ + /* } */ + + /* uint32_t next = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); */ + /* while (next != EMPTY_SLAB_PTR) { */ + /* src_unit_data = */ + /* *slab_hash_ctx.get_unit_ptr_from_list_nodes(next, + * lane_id); + */ + /* count += __popc(__ballot_sync(PAIR_PTR_LANES_MASK, */ + /* src_unit_data != EMPTY_PAIR_PTR)); + */ + /* next = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); */ + /* } */ + /* // writing back the results: */ + /* if (lane_id == 0) { */ + /* } */ +} + +template +__global__ void CountElemsKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + uint32_t* values, uint32_t* index, int* count) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + // assigning a warp per bucket + uint32_t wid = tid >> 5; + if (wid >= slab_hash_ctx.bucket_size()) { + return; + } + + uint32_t* src_unit_data_ptr = NULL; + ptr_t next = + *slab_hash_ctx.get_slab_ptr_from_list_head(wid); + + // count following nodes + while (next != EMPTY_SLAB_PTR) { +// /* + src_unit_data_ptr = + slab_hash_ctx.get_unit_ptr_from_list_nodes(next, lane_id); + if (NEXT_SLAB_PTR_LANE != lane_id && + src_unit_data_ptr[slab_hash_ctx.key_chunks + slab_hash_ctx.value_chunks - 1] != EMPTY_PAIR_PTR) { + values[atomicAdd(index, 1)] = src_unit_data_ptr[slab_hash_ctx.key_chunks + slab_hash_ctx.value_chunks - 1]; + /* + printf("%d %d %d %d\n", src_unit_data_ptr[3], + src_unit_data_ptr[0], + src_unit_data_ptr[1], + src_unit_data_ptr[2]); + */ + } + +// /* + /////////// + // TODO(ljm): Warning: handle protential overflow + for (int d = 0; d != 3; ++d) { + _Key key = reinterpret_cast<_Pair<_Key, _Value>*>(src_unit_data_ptr)->first; + 
key[d] += 1; + uint8_t lane_active = (NEXT_SLAB_PTR_LANE != lane_id) && + (src_unit_data_ptr[_MEM_UNIT_WARP_MULTIPLES - 1] != EMPTY_PAIR_PTR); + uint32_t bucket_id = slab_hash_ctx.ComputeBucket(key); + _Pair result = + slab_hash_ctx.Search(lane_active, lane_id, bucket_id, key); + if (result.second) { + atomicSub(count + d, 1); + /* + printf("found key[%d] + 1: %d\t%d\t%d --- %d\t%d\t%d\n", + d, key[0], key[1], key[2], + result.first[0], result.first[1], result.first[2]); + */ + } + } + /////////// +// */ + next = __shfl_sync(ACTIVE_LANES_MASK, *src_unit_data_ptr, NEXT_SLAB_PTR_LANE, + WARP_WIDTH); + } +} + +/* + * This kernel can be used to compute total number of elements within each + * bucket. The final results per bucket is stored in d_count_result array + */ +template +__global__ void CountElemsPerBucketKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + uint32_t* bucket_elem_counts, uint32_t hash_coef) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + // assigning a warp per bucket + uint32_t wid = tid >> 5; + if (wid >= slab_hash_ctx.bucket_size()) { + return; + } + + uint32_t count = 0; + + uint32_t src_unit_data = EMPTY_PAIR_PTR; + ptr_t next = + *slab_hash_ctx.get_slab_ptr_from_list_head(wid); + + // count following nodes + while (next != EMPTY_SLAB_PTR) { + src_unit_data = + *slab_hash_ctx.get_unit_ptr_from_list_nodes(next, lane_id); + count += __popc(__ballot_sync(PAIR_PTR_LANES_MASK, + src_unit_data != EMPTY_PAIR_PTR)); + next = __shfl_sync(ACTIVE_LANES_MASK, src_unit_data, NEXT_SLAB_PTR_LANE, + WARP_WIDTH); + } + + // write back the results: + if (lane_id == 0) { + bucket_elem_counts[wid] = count; + } +} + +//////////////// + +template +__global__ void +BulkInsertWithMappingKernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> + slab_hash_ctx, +// _Key* keys, + const int* p_coords, + int* p_mapping, + int* p_inverse_mapping, + uint32_t num_keys, + uint32_t hash_coef) { + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t lane_id = threadIdx.x & 0x1F; + + if ((tid - lane_id) >= num_keys) { + return; + } + + slab_hash_ctx.get_slab_alloc_ctx().Init(hash_coef, tid, lane_id); + + uint8_t lane_active = false; + uint32_t bucket_id = 0; + _Key key; + + if (tid < num_keys) { + lane_active = true; +// key = keys[tid]; + key = *(reinterpret_cast(p_coords + tid * + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>::key_chunks)); + bucket_id = slab_hash_ctx.ComputeBucket(key); + } + + slab_hash_ctx.BulkInsertWithMapping(lane_active, lane_id, bucket_id, key, + p_mapping, p_inverse_mapping, tid); + /* + p_inverse_mapping[tid] = static_cast(idx); + p_mapping[cnt] = static_cast(tid); + */ +} +//////////////// +using Key = Coordinate; +template class SlabHash, CudaAllocator, 5, 5, 5>; +template class SlabHashContext, 5, 5, 5>; +} // namespace slab_hash diff --git a/src/3rdparty/gpu_coords_map/include/slab_hash/slab_hash.h b/src/3rdparty/gpu_coords_map/include/slab_hash/slab_hash.h new file mode 100644 index 00000000..cac15a39 --- /dev/null +++ b/src/3rdparty/gpu_coords_map/include/slab_hash/slab_hash.h @@ -0,0 +1,389 @@ +/* + * Copyright 2019 Saman Ashkiani + * Modified 2019 by Wei Dong + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "slab_alloc.h" + +template +struct _Pair { + _Key first; + _Value second; + __device__ __host__ _Pair(const _Key& key, const _Value& value) + : first(key), second(value) {} + __device__ __host__ _Pair() : first(), second() {} +}; + +template +__device__ __host__ _Pair<_Key, _Value> _make_pair(const _Key& key, + const _Value& value) { + return _Pair<_Key, _Value>(key, value); +} + +template +using _Iterator = _Pair<_Key, _Value>*; + +namespace slab_hash { + +template +class SlabHashContext; + +template +class SlabHash { +public: + SlabHash(const uint32_t max_bucket_count, + const uint32_t max_keyvalue_count, + uint32_t device_idx); + + ~SlabHash(); + + _Value Size(); + _Value* SizePtr(); + /* Simplistic output: no iterators, and success mask is only provided + * for search. + * All the outputs are READ ONLY: change to these output will NOT change the + * internal hash table. + */ + void InsertAtomic(_Key* input_keys, uint32_t num_keys); + void Search(_Key* input_keys, + _Value* output_values, + uint8_t* output_masks, + uint32_t num_keys); + void Remove(_Key* input_keys, uint32_t num_keys); + + /* Verbose output (similar to std): return success masks for all operations, + + * and iterators for insert and search (not for remove operation, as + * iterators are invalid after erase). + * Output iterators supports READ/WRITE: change to these output will + * DIRECTLY change the internal hash table. 
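+ *
+ * Illustrative host-side sketch (the template arguments, buffer names and
+ * sizes below are placeholders; d_keys, d_iterators and d_masks are assumed
+ * to be device buffers holding num_keys entries):
+ *
+ *   slab_hash::SlabHash<Key, Value, Hash, Alloc, 5, 5, 5> table(
+ *       num_buckets, num_keys, 0);  // max buckets, max key-value pairs, GPU 0
+ *   table.InsertAtomic_(d_keys, d_iterators, d_masks, num_keys);
+ *   table.Search_(d_keys, d_iterators, d_masks, num_keys);
+ *   table.Remove_(d_keys, d_masks, num_keys);  // masks report per-key success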
+ */ + void InsertAtomic_(_Key* input_keys, + _Iterator<_Key, _Value>* output_iterators, + uint8_t* output_masks, + uint32_t num_keys); + void Search_(_Key* input_keys, + _Iterator<_Key, _Value>* output_iterators, + uint8_t* output_masks, + uint32_t num_keys); + void Remove_(_Key* input_keys, uint8_t* output_masks, uint32_t num_keys); + + //////////////// + void BulkInsertWithMapping(const int* p_coords, + int* p_mapping, + int* p_inverse_mapping, + int size); + void IterateKeys(int* p_coords, int size); + void IterateSearchAtBatch(int* p_out, int batch_index, int size); + void IterateSearchPerBatch(const std::vector& p_outs, int size); + void IterateOffsetInsert(const std::shared_ptr>& in_map, + int* p_offset, int size); + //////////////// + + /* Debug usages */ + std::vector CountElemsPerBucket(); + + void CountElems(int* count); + + double ComputeLoadFactor(); + +private: + ptr_t* bucket_list_head_; + uint32_t num_buckets_; + + _Value* cnt_value_; + + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> gpu_context_; + + std::shared_ptr<_Alloc> allocator_; + SlabAlloc<_Alloc, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>* slab_list_allocator_; + + uint32_t device_idx_; + + uint32_t hash_coef_; +}; + +/** Lite version **/ +template +__global__ void InsertAtomicKernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* input_keys, + uint32_t num_keys, + uint32_t hash_coef); +template +__global__ void SearchKernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* input_keys, + _Value* output_values, + uint8_t* output_masks, + uint32_t num_keys); +template +__global__ void RemoveKernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* input_keys, + uint32_t num_keys); + +/** Verbose version **/ +template +__global__ void InsertAtomic_Kernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* input_keys, + _Iterator<_Key, _Value>* output_iterators, + uint8_t* output_masks, + uint32_t num_keys, + uint32_t hash_coef); +template +__global__ void Search_Kernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* input_keys, + _Iterator<_Key, _Value>* output_iterators, + uint8_t* output_masks, + uint32_t num_keys); +template +__global__ void Remove_Kernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Key* input_keys, + uint8_t* output_masks, + uint32_t num_keys); + +template +__global__ void GetIteratorsKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + _Iterator<_Key, _Value>* output_iterators, + uint32_t* output_iterator_count, + uint32_t num_buckets); +template +__global__ void CountElemsPerBucketKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + uint32_t* bucket_elem_counts, uint32_t hash_coef); + +template +__global__ void CountElemsKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + 
_LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + uint32_t* values, uint32_t* index); + +template +__global__ void InitKernel( + SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, + const uint32_t num_threads, + uint32_t hash_coef); + +template +__global__ void ReleaseKernel(const uint32_t num_threads); + +/////////////// +template +__global__ void BulkInsertWithMappingKernel(SlabHashContext<_Key, _Value, _Hash, + _LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_hash_ctx, +// _Key* input_keys, + const int* p_coords, + int* p_mapping, + int* p_inverse_mapping, + uint32_t num_keys, + uint32_t hash_coef); +/////////////// + +/** + * Internal implementation for the device proxy: + * DO NOT ENTER! + **/ +template +class SlabHashContext { +public: + using SlabAllocContextT = SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES>; +public: + SlabHashContext(); + __host__ void Setup( + ptr_t* bucket_list_head, + const uint32_t num_buckets, + _Value* cnt_value, + const SlabAllocContextT& allocator_ctx) { + bucket_list_head_ = bucket_list_head; + num_buckets_ = num_buckets; + cnt_value_ = cnt_value; + slab_list_allocator_ctx_ = allocator_ctx; + } + + /* Core SIMT operations, shared by both simplistic and verbose + * interfaces */ + __device__ _Pair InsertAtomic(uint8_t& lane_active, + const uint32_t lane_id, + const uint32_t bucket_id, + const _Key& key); + __device__ _Pair Search(uint8_t& lane_active, + const uint32_t lane_id, + const uint32_t bucket_id, + const _Key& key); + + __device__ uint8_t Remove(uint8_t& lane_active, + const uint32_t lane_id, + const uint32_t bucket_id, + const _Key& key); + + ///////////////// + __device__ void BulkInsertWithMapping(uint8_t& lane_active, + const uint32_t lane_id, + const uint32_t bucket_id, + const _Key& key, + int* p_mapping, + int* p_inverse_mapping, + int key_idx); + ///////////////// + + /* Hash function */ + __device__ __host__ uint32_t ComputeBucket(const _Key& key) const; + __device__ __host__ uint32_t bucket_size() const { return num_buckets_; } + + __device__ __host__ SlabAllocContextT& get_slab_alloc_ctx() { + return slab_list_allocator_ctx_; + } + + __device__ __forceinline__ ptr_t* get_unit_ptr_from_list_nodes( + const ptr_t slab_ptr, const uint32_t lane_id) { + return slab_list_allocator_ctx_.get_unit_ptr_from_slab(slab_ptr, + lane_id); + } + __device__ __forceinline__ ptr_t* get_slab_ptr_from_list_head( + const uint32_t bucket_id) { + return bucket_list_head_ + bucket_id; + } + __device__ __forceinline__ void ClearRemainPair(ptr_t* unit_data_ptr); + +private: + __device__ __forceinline__ void CopyRemainPair(ptr_t* unit_data_ptr, + const _Key& key, + const _Value& value); + __device__ __forceinline__ void WarpSyncKey(const _Key& key, + const uint32_t lane_id, + _Key& ret); + __device__ __forceinline__ int32_t WarpFindKey(const _Key& src_key, + const uint32_t lane_id, + const ptr_t* unit_data_ptr); + __device__ __forceinline__ int32_t WarpFindEmpty(const ptr_t* unit_data_ptr); + + __device__ __forceinline__ ptr_t AllocateSlab(const uint32_t lane_id); + __device__ __forceinline__ void FreeSlab(const ptr_t slab_ptr); + +private: + uint32_t num_buckets_; + _Hash hash_fn_; + + ptr_t* bucket_list_head_; + _Value* cnt_value_; + SlabAllocContext<_LOG_NUM_MEM_BLOCKS, + _LOG_NUM_SUPER_BLOCKS, + _MEM_UNIT_WARP_MULTIPLES> slab_list_allocator_ctx_; + +public: + static constexpr uint32_t key_chunks 
= sizeof(_Key) / sizeof(uint32_t); + static constexpr uint32_t value_chunks = sizeof(_Value) / sizeof(uint32_t); +}; + + +} // namespace slab_hash diff --git a/src/broadcast.cpp b/src/broadcast.cpp index a03160c1..04435f1c 100644 --- a/src/broadcast.cpp +++ b/src/broadcast.cpp @@ -88,11 +88,11 @@ at::Tensor BroadcastForwardGPU(at::Tensor in_feat, at::Tensor in_feat_glob, int op, py::object py_in_coords_key, py::object py_glob_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); // Both coords must exist // Use the global pooling mapping - const auto &in_out = p_coords_manager->getOriginInOutMapsGPU( + const InOutMapKey map_key = p_coords_manager->getOriginInOutMaps( py_in_coords_key, py_glob_coords_key); auto out_feat = @@ -104,7 +104,9 @@ at::Tensor BroadcastForwardGPU(at::Tensor in_feat, at::Tensor in_feat_glob, BroadcastForwardKernelGPU( in_feat.data(), in_feat.size(0), in_feat_glob.data(), in_feat_glob.size(0), out_feat.data(), in_feat.size(1), op, - in_out.first, in_out.second, handle, at::cuda::getCurrentCUDAStream()); + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + handle, at::cuda::getCurrentCUDAStream()); return out_feat; } @@ -116,13 +118,13 @@ void BroadcastBackwardGPU(at::Tensor in_feat, at::Tensor grad_in_feat, py::object py_in_coords_key, py::object py_glob_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); const InOutMapKey map_key = p_coords_manager->getOriginMapHashKey( py_in_coords_key, py_glob_coords_key); - ASSERT(p_coords_manager->d_in_maps.find(map_key) != - p_coords_manager->d_in_maps.end(), + ASSERT(p_coords_manager->in_maps.find(map_key) != + p_coords_manager->in_maps.end(), "The in-out map doesn't exist for backward. 
Did you run forward pass?") grad_in_feat.resize_as_(in_feat); @@ -137,8 +139,8 @@ void BroadcastBackwardGPU(at::Tensor in_feat, at::Tensor grad_in_feat, in_feat.data(), grad_in_feat.data(), in_feat.size(0), in_feat_glob.data(), grad_in_feat_glob.data(), in_feat_glob.size(0), grad_out_feat.data(), in_feat.size(1), op, - p_coords_manager->d_in_maps[map_key], - p_coords_manager->d_out_maps[map_key], handle, + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], handle, at::cuda::getCurrentCUDAStream()); } #endif @@ -166,23 +168,23 @@ template void BroadcastBackwardCPU( py::object py_coords_manager); #ifndef CPU_ONLY -template at::Tensor BroadcastForwardGPU( +template at::Tensor BroadcastForwardGPU( at::Tensor in_feat, at::Tensor in_feat_glob, int op, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template at::Tensor BroadcastForwardGPU( +template at::Tensor BroadcastForwardGPU( at::Tensor in_feat, at::Tensor in_feat_glob, int op, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void BroadcastBackwardGPU( +template void BroadcastBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor in_feat_glob, at::Tensor grad_in_feat_glob, at::Tensor grad_out_feat, int op, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void BroadcastBackwardGPU( +template void BroadcastBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor in_feat_glob, at::Tensor grad_in_feat_glob, at::Tensor grad_out_feat, int op, py::object py_in_coords_key, py::object py_out_coords_key, diff --git a/src/broadcast.cu b/src/broadcast.cu index b74dd136..4a9d89f7 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -80,13 +80,13 @@ template void BroadcastForwardKernelGPU( const Dtype *d_in_feat, int in_nrows, const Dtype *d_in_feat_global, int in_nrows_global, Dtype *d_out_feat, int nchannel, int op, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cusparseHandle_t cushandle, cudaStream_t stream) { // Sum all sizes int num_map = 0; for (const auto &in_map : in_maps) - num_map += in_map.size(); + num_map += in_map.size(0); if (num_map != in_nrows) throw std::invalid_argument("Invalid in_map"); @@ -100,14 +100,14 @@ void BroadcastForwardKernelGPU( case 0: // + channelwise_addition <<>>( - in_nrows, nchannel, d_in_feat_global, in_maps[0].data(), - out_maps[0].data(), d_out_feat); + in_nrows, nchannel, d_in_feat_global, in_maps[0].data(), + out_maps[0].data(), d_out_feat); break; case 1: // * channelwise_multiplication <<>>( - in_nrows, nchannel, d_in_feat_global, in_maps[0].data(), - out_maps[0].data(), d_out_feat); + in_nrows, nchannel, d_in_feat_global, in_maps[0].data(), + out_maps[0].data(), d_out_feat); break; default: throw std::invalid_argument(Formatter() << "Operation not supported: " @@ -121,13 +121,13 @@ void BroadcastForwardKernelGPU( template void BroadcastForwardKernelGPU( const float *d_in_feat, int in_nrows, const float *d_in_feat_global, int in_nrows_global, float *d_out_feat, int nchannel, int op, - const pInOutMaps &in_map, const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, cusparseHandle_t cuhandle, cudaStream_t stream); template void BroadcastForwardKernelGPU( const double *d_in_feat, int in_nrows, const double *d_in_feat_global, int in_nrows_global, double *d_out_feat, int nchannel, int op, - const pInOutMaps &in_map, const pInOutMaps &out_map, 
+ const vector& in_maps, const vector& out_maps, cusparseHandle_t cuhandle, cudaStream_t stream); template @@ -135,7 +135,7 @@ void BroadcastBackwardKernelGPU( const Dtype *d_in_feat, Dtype *d_grad_in_feat, int in_nrows, const Dtype *d_in_feat_global, Dtype *d_grad_in_feat_global, int in_nrows_global, const Dtype *d_grad_out_feat, int nchannel, int op, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cusparseHandle_t cushandle, cudaStream_t stream) { Itype *d_scr, *d_in_map, *d_out_map, *d_csr_row; Dtype *d_dtype, *d_csr_val, *d_tmp_grad_in_feat_global, *d_tmp_grad_in_feat; @@ -155,7 +155,7 @@ void BroadcastBackwardKernelGPU( // Sum all sizes int num_map = 0; for (const auto &in_map : in_maps) - num_map += in_map.size(); + num_map += in_map.size(0); if (num_map != in_nrows) throw std::invalid_argument("Invalid in_map"); @@ -175,12 +175,12 @@ void BroadcastBackwardKernelGPU( d_csr_row = d_scr + 2 * nnz; // in_nrows_global + 1 CUDA_CHECK(cudaMemcpy(d_in_map, - in_maps[0].data(), // in_maps are contiguous of size nnz + in_maps[0].data(), // in_maps are contiguous of size nnz nnz * sizeof(int), cudaMemcpyDeviceToDevice)); CUDA_CHECK( cudaMemcpy(d_out_map, - out_maps[0].data(), // out_maps are contiguous of size nnz + out_maps[0].data(), // out_maps are contiguous of size nnz nnz * sizeof(int), cudaMemcpyDeviceToDevice)); /* tmp in out feat */ @@ -311,14 +311,14 @@ template void BroadcastBackwardKernelGPU( const float *d_in_feat, float *d_grad_in_feat, int in_nrows, const float *d_in_feat_global, float *d_grad_in_feat_global, int in_nrows_global, const float *d_grad_out_feat, int nchannel, int op, - const pInOutMaps &in_map, const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, cusparseHandle_t cushandle, cudaStream_t stream); template void BroadcastBackwardKernelGPU( const double *d_in_feat, double *d_grad_in_feat, int in_nrows, const double *d_in_feat_global, double *d_grad_in_feat_global, int in_nrows_global, const double *d_grad_out_feat, int nchannel, int op, - const pInOutMaps &in_map, const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, cusparseHandle_t cushandle, cudaStream_t stream); } // namespace minkowski diff --git a/src/broadcast.cuh b/src/broadcast.cuh index 68817130..b28a2ec0 100644 --- a/src/broadcast.cuh +++ b/src/broadcast.cuh @@ -28,6 +28,7 @@ #include #include #include +#include #include "gpu.cuh" #include "gpu_memory_manager.hpp" @@ -41,8 +42,7 @@ void BroadcastForwardKernelGPU(const Dtype *d_in_feat, int in_nrows, const Dtype *d_in_feat_global, int in_nrows_global, Dtype *d_out_feat, int nchannel, int op, - const pInOutMaps &d_in_map, - const pInOutMaps &d_out_map, + const vector& in_maps, const vector& out_maps, cusparseHandle_t cushandle, cudaStream_t stream); template @@ -50,7 +50,7 @@ void BroadcastBackwardKernelGPU( const Dtype *d_in_feat, Dtype *d_grad_in_feat, int in_nrows, const Dtype *d_in_feat_global, Dtype *d_grad_in_feat_global, int in_nrows_global, const Dtype *d_grad_out_feat, int nchannel, int op, - const pInOutMaps &d_in_map, const pInOutMaps &d_out_map, + const vector& in_maps, const vector& out_maps, cusparseHandle_t cushandle, cudaStream_t stream); } // namespace minkowski diff --git a/src/common.hpp b/src/common.hpp index 4666f3d9..c3ea6343 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -32,6 +32,7 @@ #include #include "coords_manager.hpp" +#include "gpu_coords_manager.hpp" #include "types.hpp" #include "utils.hpp" diff --git a/src/convolution.cpp 
b/src/convolution.cpp index b2750f36..79c3ac6b 100644 --- a/src/convolution.cpp +++ b/src/convolution.cpp @@ -101,9 +101,9 @@ void ConvolutionForwardGPU(at::Tensor in_feat, at::Tensor out_feat, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); - const auto &in_out = p_coords_manager->getInOutMapsGPU( + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); + const InOutMapKey map_key = p_coords_manager->getInOutMaps( tensor_strides, strides, kernel_sizes, dilations, region_type, offsets, py_in_coords_key, py_out_coords_key, false); @@ -120,7 +120,10 @@ void ConvolutionForwardGPU(at::Tensor in_feat, at::Tensor out_feat, ConvolutionForwardKernelGPU( in_feat.template data(), in_feat.size(1), out_feat.template data(), out_feat.size(1), - kernel.template data(), in_out.first, in_out.second, out_nrows, + kernel.template data(), + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + out_nrows, handle, at::cuda::getCurrentCUDAStream()); } @@ -133,14 +136,14 @@ void ConvolutionBackwardGPU(at::Tensor in_feat, at::Tensor grad_in_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); const InOutMapKey map_key = p_coords_manager->getMapHashKey( tensor_strides, strides, kernel_sizes, dilations, region_type, py_in_coords_key, py_out_coords_key, false, false); - ASSERT(p_coords_manager->d_in_maps.find(map_key) != - p_coords_manager->d_in_maps.end(), + ASSERT(p_coords_manager->in_maps.find(map_key) != + p_coords_manager->in_maps.end(), "The in-out map doesn't exist for backward. 
Did you run forward pass?") grad_in_feat.resize_as_(in_feat); @@ -155,8 +158,10 @@ void ConvolutionBackwardGPU(at::Tensor in_feat, at::Tensor grad_in_feat, in_feat.template data(), grad_in_feat.template data(), in_feat.size(1), grad_out_feat.template data(), grad_out_feat.size(1), kernel.template data(), - grad_kernel.template data(), p_coords_manager->d_in_maps[map_key], - p_coords_manager->d_out_maps[map_key], grad_out_feat.size(0), handle, + grad_kernel.template data(), + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + grad_out_feat.size(0), handle, at::cuda::getCurrentCUDAStream()); } #endif // end CPU_ONLY @@ -190,27 +195,27 @@ template void ConvolutionBackwardCPU( py::object py_coords_manager); #ifndef CPU_ONLY -template void ConvolutionBackwardGPU( +template void ConvolutionBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor kernel, at::Tensor grad_kernel, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void ConvolutionBackwardGPU( +template void ConvolutionBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor kernel, at::Tensor grad_kernel, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void ConvolutionForwardGPU( +template void ConvolutionForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor kernel, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void ConvolutionForwardGPU( +template void ConvolutionForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor kernel, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, diff --git a/src/convolution.cu b/src/convolution.cu index daf3edfe..2e5ed7bb 100644 --- a/src/convolution.cu +++ b/src/convolution.cu @@ -36,6 +36,10 @@ * Matrix multiplication (CUDA Kernel) on the device: C = A * B * wA is A's width and wB is B's width */ +///////////// +// TODO(ljm): fix offset logic BUG, though MAX_GRID is large enough +// +///////////// template __global__ void matmul(const Dtype *A, const int wA, const int hA, const Dtype *B, const int wB, const int hB, Dtype *C, @@ -213,8 +217,7 @@ template void ConvolutionForwardKernelGPU(const Dtype *d_in_feat, int in_nchannel, Dtype *d_out_feat, int out_nchannel, const Dtype *d_kernel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, int out_nrows, cublasHandle_t cuhandle, cudaStream_t stream) { @@ -240,7 +243,7 @@ void ConvolutionForwardKernelGPU(const Dtype *d_in_feat, int in_nchannel, // Iterate through each spatial kernel and get indices for in_map and out_map for (int k = 0; k < in_maps.size(); k++) { - n_active_in_volume = in_maps[k].size(); + n_active_in_volume = in_maps[k].size(0); if (n_active_in_volume == 0) continue; @@ -258,25 +261,25 @@ void ConvolutionForwardKernelGPU(const Dtype *d_in_feat, int in_nchannel, matmul<<>>( d_in_feat, in_nchannel, curr_num_active, &d_kernel[k * in_nchannel * out_nchannel], out_nchannel, - in_nchannel, d_out_feat, in_maps[k].data(), out_maps[k].data()); + in_nchannel, 
d_out_feat, in_maps[k].data(), out_maps[k].data()); break; case 24: matmul<<>>( d_in_feat, in_nchannel, curr_num_active, &d_kernel[k * in_nchannel * out_nchannel], out_nchannel, - in_nchannel, d_out_feat, in_maps[k].data(), out_maps[k].data()); + in_nchannel, d_out_feat, in_maps[k].data(), out_maps[k].data()); break; case 16: matmul<<>>( d_in_feat, in_nchannel, curr_num_active, &d_kernel[k * in_nchannel * out_nchannel], out_nchannel, - in_nchannel, d_out_feat, in_maps[k].data(), out_maps[k].data()); + in_nchannel, d_out_feat, in_maps[k].data(), out_maps[k].data()); break; case 8: matmul<<>>( d_in_feat, in_nchannel, curr_num_active, &d_kernel[k * in_nchannel * out_nchannel], out_nchannel, - in_nchannel, d_out_feat, in_maps[k].data(), out_maps[k].data()); + in_nchannel, d_out_feat, in_maps[k].data(), out_maps[k].data()); break; } } @@ -287,14 +290,16 @@ void ConvolutionForwardKernelGPU(const Dtype *d_in_feat, int in_nchannel, template void ConvolutionForwardKernelGPU( const float *d_in_feat, int in_nchannel, float *d_out_feat, - int out_nchannel, const float *d_kernel, const pInOutMaps &in_map, - const pInOutMaps &out_map, int out_nrows, cublasHandle_t cuhandle, + int out_nchannel, const float *d_kernel, + const vector& in_maps, const vector& out_maps, + int out_nrows, cublasHandle_t cuhandle, cudaStream_t stream); template void ConvolutionForwardKernelGPU( const double *d_in_feat, int in_nchannel, double *d_out_feat, - int out_nchannel, const double *d_kernel, const pInOutMaps &in_map, - const pInOutMaps &out_map, int out_nrows, cublasHandle_t cuhandle, + int out_nchannel, const double *d_kernel, + const vector& in_maps, const vector& out_maps, + int out_nrows, cublasHandle_t cuhandle, cudaStream_t stream); template @@ -302,8 +307,7 @@ void ConvolutionBackwardKernelGPU(const Dtype *d_in_feat, Dtype *d_grad_in_feat, int in_nchannel, const Dtype *d_grad_out_feat, int out_nchannel, const Dtype *d_kernel, Dtype *d_grad_kernel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, int out_nrows, cublasHandle_t cuhandle, cudaStream_t stream) { @@ -328,7 +332,7 @@ void ConvolutionBackwardKernelGPU(const Dtype *d_in_feat, Dtype *d_grad_in_feat, dim3 threads(shared_mem_size, shared_mem_size); for (int k = 0; k < in_maps.size(); k++) { - n_active_in_volume = in_maps[k].size(); + n_active_in_volume = in_maps[k].size(0); if (n_active_in_volume == 0) continue; @@ -350,7 +354,7 @@ void ConvolutionBackwardKernelGPU(const Dtype *d_in_feat, Dtype *d_grad_in_feat, d_in_feat, in_nchannel, curr_num_active, // D d_grad_in_feat, // C &d_grad_kernel[k * in_nchannel * out_nchannel], // E - in_maps[k].data(), out_maps[k].data()); + in_maps[k].data(), out_maps[k].data()); break; case 24: matmul2<<>>( @@ -360,7 +364,7 @@ void ConvolutionBackwardKernelGPU(const Dtype *d_in_feat, Dtype *d_grad_in_feat, d_in_feat, in_nchannel, curr_num_active, // D d_grad_in_feat, // C &d_grad_kernel[k * in_nchannel * out_nchannel], // E - in_maps[k].data(), out_maps[k].data()); + in_maps[k].data(), out_maps[k].data()); break; case 16: matmul2<<>>( @@ -370,7 +374,7 @@ void ConvolutionBackwardKernelGPU(const Dtype *d_in_feat, Dtype *d_grad_in_feat, d_in_feat, in_nchannel, curr_num_active, // D d_grad_in_feat, // C &d_grad_kernel[k * in_nchannel * out_nchannel], // E - in_maps[k].data(), out_maps[k].data()); + in_maps[k].data(), out_maps[k].data()); break; case 8: matmul2<<>>( @@ -380,7 +384,7 @@ void ConvolutionBackwardKernelGPU(const Dtype *d_in_feat, Dtype *d_grad_in_feat, 
d_in_feat, in_nchannel, curr_num_active, // D d_grad_in_feat, // C &d_grad_kernel[k * in_nchannel * out_nchannel], // E - in_maps[k].data(), out_maps[k].data()); + in_maps[k].data(), out_maps[k].data()); break; } } @@ -392,15 +396,17 @@ void ConvolutionBackwardKernelGPU(const Dtype *d_in_feat, Dtype *d_grad_in_feat, template void ConvolutionBackwardKernelGPU( const float *d_in_feat, float *d_grad_in_feat, int in_nchannel, const float *d_grad_out_feat, int out_nchannel, const float *d_kernel, - float *p_grad_kernel, const pInOutMaps &in_map, - const pInOutMaps &out_map, int out_nrows, cublasHandle_t cuhandle, + float *p_grad_kernel, + const vector& in_maps, const vector& out_maps, + int out_nrows, cublasHandle_t cuhandle, cudaStream_t stream); template void ConvolutionBackwardKernelGPU( const double *d_in_feat, double *d_grad_in_feat, int in_nchannel, const double *d_grad_out_feat, int out_nchannel, const double *d_kernel, - double *p_grad_kernel, const pInOutMaps &in_map, - const pInOutMaps &out_map, int out_nrows, cublasHandle_t cuhandle, + double *p_grad_kernel, + const vector& in_maps, const vector& out_maps, + int out_nrows, cublasHandle_t cuhandle, cudaStream_t stream); } // end namespace minkowski diff --git a/src/convolution.cuh b/src/convolution.cuh index c29b892c..3da509b1 100644 --- a/src/convolution.cuh +++ b/src/convolution.cuh @@ -27,6 +27,7 @@ #include #include +#include #include "gpu.cuh" #include "math_functions.hpp" @@ -48,8 +49,7 @@ template void ConvolutionForwardKernelGPU(const Dtype *d_in_feat, int in_nchannel, Dtype *d_out_feat, int out_nchannel, const Dtype *d_kernel, - const pInOutMaps &in_map, - const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, int out_nrows, cublasHandle_t cuhandle, cudaStream_t stream); @@ -58,8 +58,7 @@ void ConvolutionBackwardKernelGPU(const Dtype *d_in_feat, Dtype *d_grad_in_feat, int in_nchannel, const Dtype *d_grad_out_feat, int out_nchannel, const Dtype *d_kernel, Dtype *d_grad_kernel, - const pInOutMaps &in_map, - const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, int out_nrows, cublasHandle_t cuhandle, cudaStream_t stream); } //end namespace minkowski diff --git a/src/convolution_transpose.cpp b/src/convolution_transpose.cpp index 627a3178..d175ff2a 100644 --- a/src/convolution_transpose.cpp +++ b/src/convolution_transpose.cpp @@ -122,9 +122,9 @@ void ConvolutionTransposeForwardGPU( vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool generate_new_coords) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); - const auto &in_out = p_coords_manager->getInOutMapsGPU( + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); + const InOutMapKey map_key = p_coords_manager->getInOutMaps( tensor_strides, strides, kernel_sizes, dilations, region_type, offsets, py_in_coords_key, py_out_coords_key, true, false, generate_new_coords); @@ -141,7 +141,10 @@ void ConvolutionTransposeForwardGPU( ConvolutionForwardKernelGPU( in_feat.template data(), in_feat.size(1), out_feat.template data(), out_feat.size(1), - kernel.template data(), in_out.first, in_out.second, out_nrows, + kernel.template data(), + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + out_nrows, handle, at::cuda::getCurrentCUDAStream()); } @@ -152,8 +155,8 @@ void ConvolutionTransposeBackwardGPU( vector strides, vector kernel_sizes, vector dilations, int region_type, py::object 
py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); bool reverse_map = false; const InOutMapKey rev_map_key = p_coords_manager->getMapHashKey( tensor_strides, strides, kernel_sizes, dilations, region_type, @@ -177,8 +180,8 @@ void ConvolutionTransposeBackwardGPU( if (!reverse_map) { ASSERT( - p_coords_manager->d_in_maps.find(map_key) != - p_coords_manager->d_in_maps.end(), + p_coords_manager->in_maps.find(map_key) != + p_coords_manager->in_maps.end(), "The in-out map doesn't exist for backward. Did you run forward pass?"); ConvolutionBackwardKernelGPU( @@ -186,13 +189,13 @@ void ConvolutionTransposeBackwardGPU( in_feat.size(1), grad_out_feat.template data(), grad_out_feat.size(1), kernel.template data(), grad_kernel.template data(), - p_coords_manager->d_in_maps[map_key], - p_coords_manager->d_out_maps[map_key], grad_out_feat.size(0), handle, + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], grad_out_feat.size(0), handle, at::cuda::getCurrentCUDAStream()); } else { ASSERT( - p_coords_manager->d_in_maps.find(rev_map_key) != - p_coords_manager->d_in_maps.end(), + p_coords_manager->in_maps.find(rev_map_key) != + p_coords_manager->in_maps.end(), "The in-out map doesn't exist for backward. Did you run forward pass?"); ConvolutionBackwardKernelGPU( @@ -200,8 +203,8 @@ void ConvolutionTransposeBackwardGPU( in_feat.size(1), grad_out_feat.template data(), grad_out_feat.size(1), kernel.template data(), grad_kernel.template data(), - p_coords_manager->d_out_maps[rev_map_key], - p_coords_manager->d_in_maps[rev_map_key], grad_out_feat.size(0), handle, + p_coords_manager->out_maps[rev_map_key], + p_coords_manager->in_maps[rev_map_key], grad_out_feat.size(0), handle, at::cuda::getCurrentCUDAStream()); } } @@ -236,28 +239,28 @@ template void ConvolutionTransposeBackwardCPU( py::object py_coords_manager); #ifndef CPU_ONLY -template void ConvolutionTransposeForwardGPU( +template void ConvolutionTransposeForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor kernel, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool generate_new_coords); -template void ConvolutionTransposeForwardGPU( +template void ConvolutionTransposeForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor kernel, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool generate_new_coords); -template void ConvolutionTransposeBackwardGPU( +template void ConvolutionTransposeBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor kernel, at::Tensor grad_kernel, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void ConvolutionTransposeBackwardGPU( +template void ConvolutionTransposeBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor kernel, at::Tensor grad_kernel, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, diff --git a/src/coords_manager.cu b/src/coords_manager.cu.deprecated 
similarity index 100% rename from src/coords_manager.cu rename to src/coords_manager.cu.deprecated diff --git a/src/coords_manager.hpp b/src/coords_manager.hpp index e2ca7ba7..962664d9 100644 --- a/src/coords_manager.hpp +++ b/src/coords_manager.hpp @@ -42,11 +42,6 @@ #include "types.hpp" #include "utils.hpp" -#ifndef CPU_ONLY -#include "gpu_memory_manager.hpp" -#include -#endif // CPU_ONLY - namespace minkowski { using std::begin; @@ -127,9 +122,6 @@ template class CoordsManager { omp_set_dynamic(0); omp_set_num_threads(num_threads); } -#ifndef CPU_ONLY - gpu_memory_manager = std::make_shared(backend); -#endif } CoordsManager(int num_threads): CoordsManager(num_threads, PYTORCH) {} CoordsManager(): CoordsManager(-1, PYTORCH) {} @@ -154,14 +146,6 @@ template class CoordsManager { vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, bool is_transpose, bool is_pool); -#ifndef CPU_ONLY - vector> - getKernelMapGPU(vector tensor_strides, vector strides, - vector kernel_sizes, vector dilations, - int region_type, at::Tensor offsets, - py::object py_in_coords_key, py::object py_out_coords_key, - bool is_transpose, bool is_pool); -#endif // TODO make this function non-const with ability to generate a new map vector getCoordsMap(py::object py_in_coords_key, py::object py_out_coords_key) const; @@ -268,45 +252,6 @@ template class CoordsManager { vector getRowIndicesPerBatch(py::object py_in_coords_key, py::object py_out_coords_key); -#ifndef CPU_ONLY - // GPU memory manager - std::shared_ptr gpu_memory_manager; - - // Keep all in out maps throughout the lifecycle of the coords manager - // - unordered_map, InOutMapKeyHash> d_in_maps; - unordered_map, InOutMapKeyHash> d_out_maps; - - const pInOutMaps copyInOutMapToGPU(const InOutMaps &map); - void copyInOutMapsToGPU(const InOutMapKey &map_key); - - const pInOutMapsRefPair - getInOutMapsGPU(const vector &tensor_strides, const vector &strides, - const vector &kernel_sizes, const vector &dilations, - int region_type, const at::Tensor &offsets, - py::object py_in_coords_key, py::object py_out_coords_key, - bool is_transpose, bool is_pool = false, - bool force_creation = false); - - const pInOutMapsRefPair - getOriginInOutMapsGPU(py::object py_in_coords_key, - py::object py_out_coords_key); - - const pInOutMapsRefPair - getPruningInOutMapsGPU(at::Tensor use_feat, py::object py_in_coords_key, - py::object py_out_coords_key); - - const pInOutMapsRefPair - getUnionInOutMapsGPU(vector py_in_coords_keys, - py::object py_out_coords_key); - - void *getScratchGPUMemory(size_t size) { - return gpu_memory_manager.get()->tmp_data(size); - } - - void clearScratchGPUMemory() { gpu_memory_manager.get()->clear_tmp(); } - -#endif // CPU_ONLY }; // coordsmanager } // namespace minkowski diff --git a/src/gpu_coords_manager.cpp b/src/gpu_coords_manager.cpp new file mode 100644 index 00000000..6e960225 --- /dev/null +++ b/src/gpu_coords_manager.cpp @@ -0,0 +1,1527 @@ +/* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu). 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural
+ * Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part
+ * of the code.
+ */
+#include "common.hpp"
+#include "region.hpp"
+#include "utils.hpp"
+
+#include
+
+namespace py = pybind11;
+
+namespace minkowski {
+
+/*
+ * Given tensor_stride_src and tensor_stride_dst, find the respective coord_maps
+ * and return the indices of the coord_map_ind in coord_map_dst
+ */
+template
+vector> GPUCoordsManager::getKernelMap(
+    const vector& tensor_strides, const vector& strides,
+    const vector& kernel_sizes,
+    const vector& dilations, int region_type, at::Tensor offsets,
+    py::object py_in_coords_key, py::object py_out_coords_key,
+    bool is_transpose, bool is_pool) {
+  // WARNING: This function will not work properly with custom region types.
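+  // The result is {in_maps[map_key], out_maps[map_key]}: one int32 index
+  // tensor per kernel offset, pairing input feature rows with the output
+  // feature rows they contribute to (see e.g. ConvolutionForwardKernelGPU).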
+ ASSERT(region_type != 2, + "Currently, it does not support the custom region type."); + /* + const InOutMapKey map_key = getMapHashKey( + tensor_strides, strides, kernel_sizes, dilations, region_type, + py_in_coords_key, py_out_coords_key, is_transpose, is_pool); + + const auto &in_map_iter = in_maps.find(map_key); + */ + +// if (in_map_iter == in_maps.end()) { + const InOutMapKey map_key = getInOutMaps(tensor_strides, strides, kernel_sizes, dilations, region_type, + offsets, py_in_coords_key, py_out_coords_key, false); +// ASSERT(in_maps.find(map_key) != in_maps.end(), "Kernel map not found."); +// } + + return {in_maps[map_key], out_maps[map_key]}; +} + +template +vector +GPUCoordsManager::getCoordsMap(py::object py_in_coords_key, + py::object py_out_coords_key) const { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + const uint64_t in_coords_key = p_in_coords_key->getKey(); + const uint64_t out_coords_key = p_out_coords_key->getKey(); + + const auto in_map_iter = coords_maps.find(in_coords_key); + const auto out_map_iter = coords_maps.find(out_coords_key); + + ASSERT(in_map_iter != coords_maps.end(), "Input coords not found at", + to_string(in_coords_key)); + ASSERT(out_map_iter != coords_maps.end(), "Output coords not found at", + to_string(out_coords_key)); + + const auto &out_tensor_strides = p_out_coords_key->getTensorStride(); + + const auto nrows = in_map_iter->second->nrows; + + at::Tensor in = + torch::empty({static_cast(nrows + 1)}, + torch::TensorOptions().dtype(torch::kInt32)); + at::Tensor out = + torch::empty({static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32)); + + int* p_in = in.data(); + int* p_out = out.data(); + + out_map_iter->second->stride_search(in_map_iter->second, + p_in, p_out, + out_tensor_strides, + nrows); + int size = *(p_in + nrows); + in.resize_({size}); + out.resize_({size}); + return {in, out}; +} + +template +uint64_t +GPUCoordsManager::getCoordsKey(const vector &tensor_strides) const { + auto tensor_stride_hash = hash_vec>(tensor_strides); + ASSERT(coords_maps.find(tensor_stride_hash) != coords_maps.end(), + "The coord map doesn't exist for the given tensor strides ", + "tensor_stride: ", ArrToString(tensor_strides)); + return tensor_stride_hash; +} + +template +bool GPUCoordsManager::existsCoordsKey(const uint64_t coords_key) const { + return coords_maps.find(coords_key) != coords_maps.end(); +} + +template +bool GPUCoordsManager::existsCoordsKey(py::object py_coords_key) const { + CoordsKey *p_coords_key = py_coords_key.cast(); + return existsCoordsKey(p_coords_key->getKey()); +} + +template +uint64_t GPUCoordsManager::getRandomCoordsKey() { + uint64_t coords_key = random(); + while (coords_maps.find(coords_key) != coords_maps.end()) + coords_key = random(); + return coords_key; +} + +template +int GPUCoordsManager::getCoordsSize(const uint64_t coords_key) const { + const auto &coords_map_iter = coords_maps.find(coords_key); + ASSERT(coords_map_iter != coords_maps.end(), + "The coord map doesn't exist for the given coords_key: ", + to_string(coords_key), "."); + return coords_map_iter->second->size(); +} + +template +int GPUCoordsManager::getCoordsSize(py::object py_coords_key) const { + CoordsKey *p_coords_key = py_coords_key.cast(); + return getCoordsSize(p_coords_key->getKey()); +} + +template +void GPUCoordsManager::getCoords(at::Tensor coords, + py::object py_coords_key) const { + CoordsKey *p_coords_key = py_coords_key.cast(); + const uint64_t coords_key = 
p_coords_key->getKey(); + + // initialize + const auto &coords_map_iter = coords_maps.find(coords_key); + ASSERT(coords_map_iter != coords_maps.end(), + "The coord map doesn't exist for the given coords_key: ", + to_string(coords_key), "."); + //const GPUCoordsMap &coordmap = coords_map_iter->second; +// const auto& coordmap = coords_map_iter->second; + /* + int nrows = coordmap->nrows; + int ncols = coordmap->ncols; + */ + int nrows = coords_map_iter->second->nrows; + int ncols = coords_map_iter->second->ncols; + coords.resize_({nrows, ncols}); + int *p_coords = coords.data(); + + //coordmap->get_coords(p_coords, nrows); + coords_map_iter->second->get_coords(p_coords, nrows); +} + +template +void GPUCoordsManager::setOriginCoordsKey(py::object py_coords_key) { + CoordsKey *p_coords_key = py_coords_key.cast(); + const int D = p_coords_key->getDimension(); + ASSERT(D > 0, "Invalid dimension: ", D); + if (!p_coords_key->key_set) { + p_coords_key->setKey(createOriginCoords(D)); + const vector zero_vec(D, 0); + p_coords_key->setTensorStride(zero_vec); + } else { + auto coords_key = p_coords_key->getKey(); + auto origin_key = createOriginCoords(D); + ASSERT(coords_key == origin_key, "Invalid key: ", to_string(coords_key), + " != Origin key: ", to_string(origin_key)); + } +} + +/******************************* + * Initialization + *******************************/ + +/* + * coords: coordinates in IntTensor + * mapping: output mapping in IntTensor + * tensor_strides: current tensor strides this coords will be initializeds + * force_creation: even when there's a duplicate coords with the same tensor + * strides. + * force_remap: if there's duplicate coords, remap + * allow_duplicate_coords: create map when there are duplicates in the + * coordinates + */ +template +uint64_t GPUCoordsManager::initializeCoords( + at::Tensor coords, at::Tensor mapping, at::Tensor inverse_mapping, + const vector &tensor_strides, const bool force_creation, + const bool force_remap, const bool allow_duplicate_coords, + const bool return_inverse) { + device = coords.device(); + const int nrows = coords.size(0); + const int ncols = coords.size(1); + const int D = ncols - 1; + + // Basic assertions + ASSERT(force_creation == true, "force_creation must be true"); + ASSERT(D == tensor_strides.size(), "The coordinate dimension (ncols - 1) ", + to_string(D), + " must match the size of tensor stride: ", ArrToString(tensor_strides), + "."); + + uint64_t key = hash_vec(tensor_strides); + + if (coords_maps.find(key) != coords_maps.end()) { + // If force creation, set a random key that doesn't exist + if (force_creation) { + key = getRandomCoordsKey(); + } else { + ASSERT(false, "The coord map already exists for the given tensor stride ", + "tensor_stride: ", ArrToString(tensor_strides), + "For more information, please refer to the SparseTensor creation " + "documentation available at:" + "https://nvidia.github.io/MinkowskiEngine/sparse_tensor.html"); + } + } + + // Create the concurrent coords map + mapping.resize_(static_cast(nrows)).to(device); + inverse_mapping.resize_(static_cast(nrows)).to(device); + int* p_coords = coords.data(); + int* p_mapping = mapping.data(); + int* p_inverse_mapping = inverse_mapping.data(); + float duplicate_factor = 0.1; + coords_maps[key] = std::make_shared>(nrows, duplicate_factor); + + ASSERT(force_remap == true, + "Please use cpu version when force_remap == false"); + + auto coords_map_size = coords_maps[key]->initialize_batch( + p_coords, p_mapping, p_inverse_mapping, + nrows, ncols, force_remap, 
return_inverse); + + min_nrows = coords_map_size; + min_coords_key = key; + + if (!allow_duplicate_coords && !force_remap) { + ASSERT(nrows == coords_map_size, "Duplicate coordinates found. ", + "Number of input coords:", nrows, + " != Number of unique coords:", coords_map_size, + "If the duplication was intentional, set force_remap to true." + "For more information, please refer to the SparseTensor creation " + "documentation available at: " + "https://nvidia.github.io/MinkowskiEngine/sparse_tensor.html"); + } + + // When remapping, return the mapping to pytorch. + if (force_remap || return_inverse) { +// ASSERT(mapping.dtype() == torch::kInt64, +// "Mapping must be a torch::LongTensor"); + mapping.resize_({coords_map_size}); + } + + if (return_inverse) { +// ASSERT(inverse_mapping.dtype() == torch::kInt64, +// "Inverse Mapping must be a torch::LongTensor"); + ASSERT(inverse_mapping.size(0) == nrows, + "inverse_mapping's size must equal to nrows"); + } + + return key; +} + +template +uint64_t GPUCoordsManager::initializeCoords( + at::Tensor coords, at::Tensor mapping, at::Tensor inverse_mapping, + py::object py_coords_key, const bool force_creation, const bool force_remap, + const bool allow_duplicate_coords, const bool return_inverse) { + CoordsKey *p_coords_key = py_coords_key.cast(); + + const uint64_t in_coords_key = initializeCoords( + coords, mapping, inverse_mapping, p_coords_key->getTensorStride(), + force_creation, force_remap, allow_duplicate_coords, return_inverse); + + // Tensor strides initialized on the python side. + p_coords_key->setKey(in_coords_key); + + return in_coords_key; +} + +/*********************************/ +template +uint64_t GPUCoordsManager::createStridedCoords( + uint64_t coords_key, const vector &tensor_strides, + const vector &strides, bool force_creation) { + // Basic assertions + ASSERT(existsCoordsKey(coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(coords_key), "."); + + const vector out_tensor_strides = + computeOutTensorStride(tensor_strides, strides, false); + + const int D = coords_maps[coords_key]->ncols - 1; + ASSERT(D == tensor_strides.size(), "The coordinate dimensions mismatch. ", + "GPUCoordsManager dimension: ", to_string(D), + ", tensor_strides dimension: ", to_string(tensor_strides.size())); + + uint64_t out_coords_key = 0; + const bool is_identity = + std::all_of(strides.begin(), strides.end(), [](int s) { return s == 1; }); + + if (is_identity) { + out_coords_key = coords_key; + } else { + + // tensor_strides.size() == strides.size() on computeOutTensorStride + out_coords_key = hash_vec(out_tensor_strides); + + // If force creationg, get a random key. + // ElseIf the coordinates already exists, return the key. 
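+    // The strided map is keyed by hash_vec(out_tensor_strides) (e.g. a tensor
+    // stride of {1,1,1} with stride {2,2,2} hashes the output stride {2,2,2}),
+    // so repeated calls producing the same output stride reuse one coords map
+    // unless force_creation draws a fresh random key below.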
+ if (force_creation) { + if (existsCoordsKey(out_coords_key)) + out_coords_key = getRandomCoordsKey(); + } else if (existsCoordsKey(out_coords_key)) { + return out_coords_key; + } + + // Create a strided coords map + int duplicate_factor = 1; + for (auto stride : strides) duplicate_factor *= stride; + duplicate_factor = 1.0 / duplicate_factor; + const auto nrows = coords_maps[coords_key]->nrows; + coords_maps[out_coords_key] = std::make_shared>(nrows, duplicate_factor); + auto out_nrows = coords_maps[out_coords_key]->stride_insert(coords_maps[coords_key], + out_tensor_strides, + nrows); + if (out_nrows < min_nrows) { + min_nrows = out_nrows; + min_coords_key = out_coords_key; + } + } + + return out_coords_key; +} + +template +const InOutMapKey +GPUCoordsManager::getStridedInOutMaps( + py::object py_in_coords_key, py::object py_out_coords_key, + const vector& tensor_strides, const vector& strides, + const vector& kernel_sizes, const vector& dilations, int region_type, + bool is_transpose, bool is_pool, + bool force_creation) { + + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + const uint64_t in_coords_key = p_in_coords_key->getKey(); + uint64_t out_coords_key = 0; + + /* + if (!p_out_coords_key->tensor_stride_set) { + p_out_coords_key->setTensorStride(tensor_strides); + p_out_coords_key->up_stride(strides); + } + */ + + // Basic assertions + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + + const int D = coords_maps[in_coords_key]->ncols - 1; + ASSERT(D == tensor_strides.size(), "The coordinate dimensions mismatch. ", + "GPUCoordsManager dimension: ", to_string(D), + ", tensor_strides dimension: ", to_string(tensor_strides.size())); + + const vector out_tensor_strides = + computeOutTensorStride(tensor_strides, strides, is_transpose); + + const bool is_identity = + std::all_of(strides.begin(), strides.end(), [](int s) { return s == 1; }); + + if (is_identity) { + ASSERT(!p_out_coords_key->isKeySet() || + p_out_coords_key->getKey() == p_in_coords_key->getKey(), + "Be aware of coords_key overwrite leakage"); + out_coords_key = in_coords_key; + p_out_coords_key->setKey(out_coords_key); + if (!p_out_coords_key->tensor_stride_set) { + p_out_coords_key->setTensorStride(tensor_strides); + p_out_coords_key->up_stride(strides); + } + } else if (force_creation) { + return createStridedInOutMaps( + py_in_coords_key, py_out_coords_key, + tensor_strides, strides, + kernel_sizes, dilations, region_type, + is_transpose, is_pool, + true); + } else if (p_out_coords_key->isKeySet()) { + out_coords_key = p_out_coords_key->getKey(); + } else { + out_coords_key = hash_vec(out_tensor_strides); + if (!existsCoordsKey(out_coords_key)) { + return createStridedInOutMaps( + py_in_coords_key, py_out_coords_key, + tensor_strides, strides, + kernel_sizes, dilations, region_type, + is_transpose, is_pool, + false); + } + } + + const InOutMapKey map_key = getMapHashKey( + tensor_strides, strides, kernel_sizes, dilations, region_type, + py_in_coords_key, py_out_coords_key, is_transpose, is_pool); + + if (in_maps.find(map_key) != in_maps.end()) return map_key; + + const auto nrows = coords_maps[in_coords_key]->nrows; + + vector th_ins(1, + torch::empty({static_cast(nrows + 1)}, + torch::TensorOptions().dtype(torch::kInt32))); + vector th_outs(1, + torch::empty({static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32))); + + int* p_in = th_ins[0].data(); + int* 
p_out = th_outs[0].data(); + + coords_maps[out_coords_key]->stride_search(coords_maps[in_coords_key], + p_in, p_out, + out_tensor_strides, + nrows); + int size = *(p_in + nrows); + th_ins[0].resize_({size}); + th_outs[0].resize_({size}); + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + return map_key; +} + +template +const InOutMapKey +GPUCoordsManager::createStridedInOutMaps( + py::object py_in_coords_key, py::object py_out_coords_key, + const vector &tensor_strides, + const vector &strides, + vector kernel_sizes, vector dilations, int region_type, + bool is_transpose, bool is_pool, + bool force_creation) { + + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + const uint64_t in_coords_key = p_in_coords_key->getKey(); + uint64_t out_coords_key = 0; + + // Basic assertions + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + + const int D = coords_maps[in_coords_key]->ncols - 1; + ASSERT(D == tensor_strides.size(), "The coordinate dimensions mismatch. ", + "GPUCoordsManager dimension: ", to_string(D), + ", tensor_strides dimension: ", to_string(tensor_strides.size())); + + const bool is_identity = + std::all_of(strides.begin(), strides.end(), [](int s) { return s == 1; }); + + ASSERT(is_identity == false, + "Please check is_identity in getStridedInOutMaps"); + + const vector out_tensor_strides = + computeOutTensorStride(tensor_strides, strides, is_transpose); + + out_coords_key = hash_vec(out_tensor_strides); + if (force_creation) { + if (existsCoordsKey(out_coords_key)) + out_coords_key = getRandomCoordsKey(); + } else { + ASSERT(!existsCoordsKey(out_coords_key), + "createX will always come from getX, getX has handled this condition"); + } + + p_out_coords_key->setKey(out_coords_key); + + if (!p_out_coords_key->tensor_stride_set) { + p_out_coords_key->setTensorStride(tensor_strides); + p_out_coords_key->up_stride(strides); + } + + const InOutMapKey map_key = getMapHashKey( + tensor_strides, strides, kernel_sizes, dilations, region_type, + py_in_coords_key, py_out_coords_key, is_transpose, is_pool); + +// if (in_maps.find(map_key) != in_maps.end()) return; + ASSERT(in_maps.find(map_key) == in_maps.end(), + "out_coords_key is new, ins/outs maps have to be generated."); + + const auto nrows = coords_maps[in_coords_key]->nrows; + + vector th_ins(1, + torch::empty({static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32))); + vector th_outs(1, + torch::empty({static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32))); + + int* p_in = th_ins[0].data(); + int* p_out = th_outs[0].data(); + + // Create a strided coords map + int duplicate_factor = 1; + for (auto stride : strides) duplicate_factor *= stride; + duplicate_factor = 1.0 / duplicate_factor; + coords_maps[out_coords_key] = std::make_shared>(nrows, duplicate_factor); + auto out_nrows = coords_maps[out_coords_key]->stride_insert_search(coords_maps[in_coords_key], + p_in, p_out, + out_tensor_strides, + nrows); + if (out_nrows < min_nrows) { + min_nrows = out_nrows; + min_coords_key = out_coords_key; + } + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + return map_key; +} + +template +const InOutMapKey +GPUCoordsManager::getTransposedStridedRegionInOutMaps( + py::object py_in_coords_key, py::object py_out_coords_key, + const vector& tensor_strides, + const vector& strides, const vector& kernel_sizes, const vector& dilations, + 
int region_type, + bool is_transpose, bool is_pool, + at::Tensor offsets, + bool force_creation) { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + const uint64_t in_coords_key = p_in_coords_key->getKey(); + uint64_t out_coords_key = 0; + + // Basic assertions + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + + const int D = coords_maps[in_coords_key]->ncols - 1; + ASSERT(D == tensor_strides.size(), "The coordinate dimensions mismatch. ", + "GPUCoordsManager dimension: ", to_string(D), + ", tensor_strides dimension: ", to_string(tensor_strides.size())); + + const vector out_tensor_strides = + computeOutTensorStride(tensor_strides, strides, is_transpose); + + const bool is_identity = + std::all_of(strides.begin(), strides.end(), [](int s) { return s == 1; }); + + if (is_identity) { + ASSERT(!p_out_coords_key->isKeySet() || + p_out_coords_key->getKey() == p_in_coords_key->getKey(), + "Be aware of coords_key overwrite leakage"); + out_coords_key = in_coords_key; + p_out_coords_key->setKey(out_coords_key); + if (!p_out_coords_key->tensor_stride_set) { + p_out_coords_key->setTensorStride(tensor_strides); + p_out_coords_key->up_stride(strides); + } + } else if (force_creation) { + return createTransposedStridedRegionInOutMaps( + py_in_coords_key, py_out_coords_key, + tensor_strides, strides, + kernel_sizes, dilations, region_type, + is_transpose, is_pool, + offsets, + true); + } else if (p_out_coords_key->isKeySet()) { + out_coords_key = p_out_coords_key->getKey(); + } else { + out_coords_key = hash_vec(out_tensor_strides); + if (!existsCoordsKey(out_coords_key)) { + return createTransposedStridedRegionInOutMaps( + py_in_coords_key, py_out_coords_key, + tensor_strides, strides, + kernel_sizes, dilations, region_type, + is_transpose, is_pool, + offsets, + false); + } + } + + const InOutMapKey map_key = getMapHashKey( + tensor_strides, strides, kernel_sizes, dilations, region_type, + py_in_coords_key, py_out_coords_key, is_transpose, is_pool); + + if (in_maps.find(map_key) != in_maps.end()) return map_key; + + const auto nrows = coords_maps[in_coords_key]->nrows; + + // Create transposed coords map + Region region = Region(out_tensor_strides, kernel_sizes, dilations, + region_type, offsets.data(), offsets.size(0)); + +// in_maps[map_key] = vector(region.size(), + vector th_ins(region.size(), + torch::empty({static_cast(nrows + 1)}, + torch::TensorOptions().dtype(torch::kInt32))); +// out_maps[map_key] = vector(region.size(), + vector th_outs(region.size(), + torch::empty({static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32))); + + vector p_ins(region.size()); + vector p_outs(region.size()); + for (size_t c = 0; c != region.size(); ++c) { + p_ins[c] = th_ins[c].data(); + p_outs[c] = th_outs[c].data(); + } + + coords_maps[out_coords_key]->region_search(coords_maps[in_coords_key], + p_ins, p_outs, + region, nrows); + for (size_t c = 0; c != region.size(); ++c) { + int size = *(p_ins[c] + nrows); + th_ins[c].resize_({size}); + th_outs[c].resize_({size}); + } + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + return map_key; +} + +template +const InOutMapKey +GPUCoordsManager::createTransposedStridedRegionInOutMaps( + py::object py_in_coords_key, py::object py_out_coords_key, + const vector& tensor_strides, + const vector& strides, const vector& kernel_sizes, const vector& dilations, + int region_type, + bool 
is_transpose, bool is_pool, + at::Tensor offsets, bool force_creation) { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + const uint64_t in_coords_key = p_in_coords_key->getKey(); + uint64_t out_coords_key = 0; + + // Basic assertions + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + + const int D = coords_maps[in_coords_key]->ncols - 1; + ASSERT(D == tensor_strides.size(), "The coordinate dimensions mismatch. ", + "GPUCoordsManager dimension: ", to_string(D), + ", tensor_strides dimension: ", to_string(tensor_strides.size())); + + const bool is_identity = + std::all_of(strides.begin(), strides.end(), [](int s) { return s == 1; }); + + ASSERT(is_identity == false, + "Please check is_identity in getStridedInOutMaps"); + + const vector out_tensor_strides = + computeOutTensorStride(tensor_strides, strides, is_transpose); + + // Set the out_coords_key and return if a key already exists. + out_coords_key = hash_vec(out_tensor_strides); + if (force_creation) { + // set a random coords key if force creation is set + if (existsCoordsKey(out_coords_key)) + out_coords_key = getRandomCoordsKey(); + } else { + ASSERT(!existsCoordsKey(out_coords_key), + "createX will always come from getX, getX has handled this condition"); + } + + p_out_coords_key->setKey(out_coords_key); + + if (!p_out_coords_key->tensor_stride_set) { + p_out_coords_key->setTensorStride(tensor_strides); + p_out_coords_key->up_stride(strides); + } + + const InOutMapKey map_key = getMapHashKey( + tensor_strides, strides, kernel_sizes, dilations, region_type, + py_in_coords_key, py_out_coords_key, is_transpose, is_pool); + +// if (in_maps.find(map_key) != in_maps.end()) return; + ASSERT(in_maps.find(map_key) == in_maps.end(), + "out_coords_key is new, ins/outs maps have to be generated."); + + const auto nrows = coords_maps[in_coords_key]->nrows; + + // Create transposed coords map + Region region = Region(out_tensor_strides, kernel_sizes, dilations, + region_type, offsets.data(), offsets.size(0)); + +// in_maps[map_key] = vector(region.size(), + vector th_ins(region.size(), + torch::empty({static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32))); +// out_maps[map_key] = vector(region.size(), + vector th_outs(region.size(), + torch::empty({static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32))); + + vector p_ins(region.size()); + vector p_outs(region.size()); + for (size_t c = 0; c != region.size(); ++c) { + p_ins[c] = th_ins[c].data(); + p_outs[c] = th_outs[c].data(); + } + + float duplicate_factor = 1.0; + for (auto stride : strides) duplicate_factor *= stride; + coords_maps[out_coords_key] = std::make_shared>( + nrows, + duplicate_factor); + auto out_nrows = coords_maps[out_coords_key]->region_insert_search( + coords_maps[in_coords_key], + p_ins, p_outs, + region, nrows); + if (out_nrows < min_nrows) { + min_nrows = out_nrows; + min_coords_key = out_coords_key; + } + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + return map_key; +} + +template +uint64_t GPUCoordsManager::createTransposedStridedRegionCoords( + uint64_t coords_key, const vector &tensor_strides, + const vector &strides, vector kernel_sizes, vector dilations, + int region_type, at::Tensor offsets, bool force_creation) { + const vector out_tensor_strides = + computeOutTensorStride(tensor_strides, strides, true /* is_transpose */); + + // Basic assertions + 
ASSERT(existsCoordsKey(coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(coords_key), "."); + + const int D = coords_maps[coords_key]->ncols - 1; + ASSERT(D == tensor_strides.size(), "The coordinate dimensions mismatch. ", + "GPUCoordsManager dimension: ", to_string(D), + ", tensor_strides dimension: ", to_string(tensor_strides.size())); + + // Set the out_coords_key and return if a key already exists. + uint64_t out_coords_key = hash_vec(out_tensor_strides); + if (force_creation) { + // set a random coords key if force creation is set + if (existsCoordsKey(out_coords_key)) + out_coords_key = getRandomCoordsKey(); + } else if (existsCoordsKey(out_coords_key)) { + // Returnn if not force_creation and the key exists + return out_coords_key; + } + + // Create transposed coords map + Region region = Region(out_tensor_strides, kernel_sizes, dilations, + region_type, offsets.data(), offsets.size(0)); + + const int nrows = coords_maps[coords_key]->nrows; + float duplicate_factor = 1.0; + for (auto stride : strides) duplicate_factor *= stride; + coords_maps[out_coords_key] = std::make_shared>(nrows, + duplicate_factor); + auto out_nrows = coords_maps[out_coords_key]->region_insert( + coords_maps[coords_key], region, nrows); + + if (out_nrows < min_nrows) { + min_nrows = out_nrows; + min_coords_key = out_coords_key; + } + return out_coords_key; +} + +template +uint64_t GPUCoordsManager::createOriginCoords(const int D) { + const vector zero_tensor_strides(D, 0); + const uint64_t out_coords_key = hash_vec(zero_tensor_strides); + // If the coordinates already exists, return the key. + if (existsCoordsKey(out_coords_key)) + return out_coords_key; + + coords_maps[out_coords_key] = std::make_shared>(1, 1.0); + // TODO(ljm): implement batch_insert + batch_size = coords_maps[out_coords_key]->batch_insert(coords_maps[min_coords_key], + coords_maps[min_coords_key]->nrows); + if (batch_size < min_nrows) { + min_nrows = batch_size; + min_coords_key = out_coords_key; + } + return out_coords_key; +} + +template +long int GPUCoordsManager::getBatchSize() { + if (batch_size == -1) createOriginCoords(D); + return batch_size; +} + +template +const InOutMapKey GPUCoordsManager::getMapHashKey( + vector tensor_strides, vector strides, vector kernel_sizes, + vector dilations, int region_type, py::object py_in_coords_key, + py::object py_out_coords_key, bool is_transpose, bool is_pool) const { + const int D = tensor_strides.size(); + ASSERT(D == tensor_strides.size() and D == strides.size() and + D == kernel_sizes.size() and D == dilations.size(), + "Size mismatch. 
tensor_strides: ", tensor_strides.size(), + ", strides: ", strides.size(), ", kernel_sizes: ", kernel_sizes.size(), + ", dilations: ", dilations.size()); + + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + const uint64_t out_coords_key = p_out_coords_key->getKey(); + const uint64_t in_coords_key = p_in_coords_key->getKey(); + const uint64_t stride_hash = hash_vec(strides); + const uint64_t kernel_size_hash = hash_vec(kernel_sizes); + const uint64_t dilation_hash = hash_vec(dilations); + const InOutMapKey map_key = { + in_coords_key, out_coords_key, stride_hash, kernel_size_hash, + dilation_hash, (uint64_t)region_type, is_transpose, is_pool}; + + return map_key; +} + +template +const InOutMapKey GPUCoordsManager::getOriginMapHashKey( + py::object py_in_coords_key, py::object py_out_coords_key) const { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + ASSERT( + p_in_coords_key->key_set and p_out_coords_key->key_set, + "Key is not set. in_coords_key: ", to_string(p_in_coords_key->getKey()), + ", out_coords_key: ", to_string(p_out_coords_key->getKey())); + + const int D = p_in_coords_key->getDimension(); + + const uint64_t out_coords_key = p_out_coords_key->getKey(); + const uint64_t in_coords_key = p_in_coords_key->getKey(); + const vector zero_vec(D, 0); + const uint64_t zero_hash = hash_vec(zero_vec); + const InOutMapKey map_key = { + in_coords_key, out_coords_key, zero_hash, zero_hash, zero_hash, 0, false, + true}; + return map_key; +} + +template +const InOutMapKey +GPUCoordsManager::getUnionMapHashKey(vector py_in_coords_keys, + py::object py_out_coords_key) const { + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + ASSERT(py_in_coords_keys.size() > 1, "Number of input coords must be > 1"); + vector p_in_coords_keys; + // We use sum of coords key (even with overflow, it will be unique with high + // prob). We use sum to make the key invariant to the order of the keys. + uint64_t sum_in_coords_key = 0; + CoordsKey *p_in_coords_key = py_in_coords_keys[0].cast(); + for (auto &py_in_coords_key : py_in_coords_keys) { + p_in_coords_key = py_in_coords_key.cast(); + const uint64_t in_coords_key = p_in_coords_key->getKey(); + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + sum_in_coords_key += in_coords_key; + } + + ASSERT(p_out_coords_key->key_set, "Key is not set. out_coords_key: ", + to_string(p_out_coords_key->getKey())); + + const uint64_t out_coords_key = p_out_coords_key->getKey(); + const vector zero_vec(p_in_coords_key->getDimension(), 0); + const uint64_t zero_hash = hash_vec(zero_vec); + InOutMapKey map_key = {sum_in_coords_key, + out_coords_key, + zero_hash, + zero_hash, + zero_hash, + 0, + false, + true}; + return map_key; +} +/** + * Entry function for coords map generation and the associated kernel maps. + */ +template +const InOutMapKey +GPUCoordsManager::getInOutMaps( + const vector &tensor_strides, const vector &strides, + const vector &kernel_sizes, const vector &dilations, + int region_type, const at::Tensor &offsets, py::object py_in_coords_key, + py::object py_out_coords_key, bool is_transpose, bool is_pool, + bool force_creation) { + // + // Warning(ljm): In the GPU version, when `is_transpose == True`, + // the `Filp` ins/outs maps generation in CPU is not used. 
+  // It is the same as the `non-Flip` version, except that when `is_pool == True`
+  // and `kernel_size` is even there is a small difference in the ins/outs maps.
+  // This does not change the mathematical meaning, just like the customized
+  // `region`-based sparse convolution compared with the classic version that
+  // mirrors the regular `Convolution` in torch, as `spconv` does.
+  // Also, the `non-Flip` GPU implementation can merge the insert and search
+  // operations, reducing the region iterations from two to one, which should
+  // be a significant optimization.
+  //
+  // Another remark:
+  // By the logic of the CPU implementation, the following ins/outs flip cache
+  // will never hit:
+  //   in_maps[map_key] = out_maps[tmp_map_key];
+  //   out_maps[map_key] = in_maps[tmp_map_key];
+  // so we do not check for it in the GPU version. If it ever did hit, the
+  // `non-Flip` GPU implementation would catch it automatically.
+  //
+  const int D = tensor_strides.size();
+  ASSERT(D == tensor_strides.size() and D == strides.size() and
+             D == kernel_sizes.size() and D == dilations.size(),
+         "Size mismatch. tensor_strides: ", tensor_strides.size(),
+         ", strides: ", strides.size(), ", kernel_sizes: ", kernel_sizes.size(),
+         ", dilations: ", dilations.size());
+  ASSERT(std::all_of(tensor_strides.begin(), tensor_strides.end(),
+                     [](int k) { return k > 0; }),
+         "Invalid tensor_strides: ", ArrToString(tensor_strides),
+         " Tensor strides must be positive integers.");
+
+  if (!is_transpose) {
+    // TODO(ljm): track the update in the cpu version
+    // TODO: even numbered kernel size to use region_type 0
+    if (is_pool && (strides == kernel_sizes)) {
+      return getStridedInOutMaps(
+          py_in_coords_key, py_out_coords_key,
+          tensor_strides, strides,
+          kernel_sizes, dilations, region_type,
+          is_transpose, is_pool,
+          force_creation);
+    } else {
+      CoordsKey *p_in_coords_key = py_in_coords_key.cast<CoordsKey *>();
+      CoordsKey *p_out_coords_key = py_out_coords_key.cast<CoordsKey *>();
+      // Will return the in_coords_key if strides == 1.
+      auto out_coords_key = createStridedCoords(
+          p_in_coords_key->getKey(), tensor_strides, strides, force_creation);
+
+      p_out_coords_key->setKey(out_coords_key);
+      if (!p_out_coords_key->tensor_stride_set) {
+        p_out_coords_key->setTensorStride(tensor_strides);
+        p_out_coords_key->up_stride(strides);
+      }
+
+      // Use the Transposed path to generate the Non-Transposed maps.
+      // Flip is needed, but Flip equals Non-Flip when the kernel is symmetric
+      // and differs only slightly when the kernel is non-symmetric.
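+      // Illustrative example: for an odd kernel size such as 3 the 1-D offset
+      // set is {-1, 0, 1}, which is symmetric under negation, so the flipped
+      // and unflipped maps coincide; for an even kernel size the offset set is
+      // not symmetric under negation, which is where the small difference
+      // mentioned above can appear.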
+ return getTransposedStridedRegionInOutMaps( + py_in_coords_key, py_out_coords_key, + tensor_strides, strides, kernel_sizes, + dilations, region_type, + is_transpose, is_pool, + offsets, false); + } + } else { + const bool is_identity = + std::all_of(strides.begin(), strides.end(), [](int s) { return s == 1; }); + ASSERT(is_identity == false, + "It is meaningless of identity in transpose conv"); + + if (is_pool && strides == kernel_sizes && region_type == 0) { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + auto out_coords_key = createTransposedStridedRegionCoords( + p_in_coords_key->getKey(), tensor_strides, strides, kernel_sizes, + dilations, region_type, offsets, force_creation); + + p_out_coords_key->setKey(out_coords_key); + if (!p_out_coords_key->tensor_stride_set) { + p_out_coords_key->setTensorStride(tensor_strides); + p_out_coords_key->up_stride(strides); + } + + return getStridedInOutMaps( + py_in_coords_key, py_out_coords_key, + tensor_strides, strides, + kernel_sizes, dilations, region_type, + is_transpose, is_pool, + false); + } else { + return getTransposedStridedRegionInOutMaps( + py_in_coords_key, py_out_coords_key, + tensor_strides, strides, kernel_sizes, + dilations, region_type, + is_transpose, is_pool, + offsets, force_creation); + } + } +} + +template +const InOutMapKey +GPUCoordsManager::getOriginInOutMaps(py::object py_in_coords_key, + py::object py_out_coords_key) { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + + const int D = p_in_coords_key->getDimension(); + // Create output coordinates if it doesn't exist + if (!p_out_coords_key->key_set) { + p_out_coords_key->setKey(createOriginCoords(D)); + const vector zero_vec(D, 0); + p_out_coords_key->setTensorStride(zero_vec); + } + + const uint64_t in_coords_key = p_in_coords_key->getKey(); + const uint64_t out_coords_key = p_out_coords_key->getKey(); + + // Map key for origin hash map + const InOutMapKey map_key = + getOriginMapHashKey(py_in_coords_key, py_out_coords_key); + + // For non transpose case + // make a kernel mapping. The kernel will be saved with the map_key. + if (in_maps.find(map_key) == in_maps.end()) { + ASSERT(coords_maps[out_coords_key]->size() == batch_size, + "Coords size mismatch. 
GPUCoordsMap size: ", + coords_maps[out_coords_key]->size(), + ", batch size: ", batch_size); + const auto nrows = coords_maps[in_coords_key]->nrows; + vector th_ins(1, torch::empty( + {static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32))); + vector th_outs(1, torch::empty( + {static_cast(nrows)}, + torch::TensorOptions().dtype(torch::kInt32))); + int* p_in = th_ins[0].data(); + int* p_out = th_outs[0].data(); + coords_maps[out_coords_key]->batch_search( + coords_maps[in_coords_key], + p_in, p_out, nrows); + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + } + return map_key; +} + +template +pair, vector> +GPUCoordsManager::getUnionMap(vector py_in_coords_keys, + py::object py_out_coords_key) { + + // all exception handling will be done inside the following + const auto map_key = getUnionInOutMaps(py_in_coords_keys, py_out_coords_key); + + return {in_maps[map_key], out_maps[map_key]}; +} + +// WARNING(ljm): this is not a in-use function +template +uint64_t +GPUCoordsManager::createUnionCoords(vector py_in_coords_keys, + py::object py_out_coords_key) { + + //vector>> in_coords_maps(py_in_coords_keys.size()); + vector>> in_coords_maps(py_in_coords_keys.size()); + vector in_coords_map_sizes(py_in_coords_keys.size()); + CoordsKey *p_in_coords_key = py_in_coords_keys[0].cast(); + auto tensor_strides = p_in_coords_key->getTensorStride(); + //GPUCoordsMap& curr_map = coords_maps[p_in_coords_key->getKey()]; + in_coords_maps[0] = coords_maps[p_in_coords_key->getKey()]; + in_coords_map_sizes[0] = in_coords_maps[0]->nrows; + int total_in_keys = in_coords_map_sizes[0]; + for (size_t i = 1; i != py_in_coords_keys.size(); ++i) { + // Set the tensor strides to the smallest elements. + p_in_coords_key = py_in_coords_keys[i].cast(); + transform(tensor_strides.begin(), /* In1 begin */ + tensor_strides.end(), /* In1 end */ + p_in_coords_key->getTensorStride().begin(), /* In2 begin */ + tensor_strides.begin(), /* out begin */ + [](int a, int b) -> int { return std::min(a, b); } /* binary op */ + ); + in_coords_maps[i] = coords_maps[p_in_coords_key->getKey()]; + in_coords_map_sizes[i] = in_coords_maps[i]->nrows; + total_in_keys += in_coords_map_sizes[i]; + + const uint64_t in_coords_key = p_in_coords_key->getKey(); + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + } + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + + // set a random coords key + const uint64_t out_coords_key = getRandomCoordsKey(); + + // Set the pycoordskey using the last coords_key + p_out_coords_key->setDimension(p_in_coords_key->getDimension()); + p_out_coords_key->setKey(out_coords_key); + p_out_coords_key->setTensorStride(tensor_strides); + + coords_maps[out_coords_key] = + std::make_shared>(total_in_keys, 1.0 / in_coords_map_sizes.size()); + + auto out_nrows = coords_maps[out_coords_key]->union_insert(in_coords_maps, + in_coords_map_sizes); + + if (out_nrows < min_nrows) { + min_nrows = out_nrows; + min_coords_key = out_coords_key; + } + return out_coords_key; +} + +template +const InOutMapKey +GPUCoordsManager::createUnionInOutMaps(const vector& py_in_coords_keys, + py::object py_out_coords_key) { + + //vector>> in_coords_maps(py_in_coords_keys.size()); + vector>> in_coords_maps(py_in_coords_keys.size()); + vector in_coords_map_sizes(py_in_coords_keys.size()); + vector th_ins(py_in_coords_keys.size()); + vector th_outs(py_in_coords_keys.size()); + vector p_ins(py_in_coords_keys.size()); + vector 
p_outs(py_in_coords_keys.size()); + CoordsKey *p_in_coords_key = py_in_coords_keys[0].cast(); + auto tensor_strides = p_in_coords_key->getTensorStride(); + in_coords_maps[0] = coords_maps[p_in_coords_key->getKey()]; + in_coords_map_sizes[0] = in_coords_maps[0]->nrows; + int total_in_keys = in_coords_map_sizes[0]; + th_ins[0] = torch::empty( + {static_cast(in_coords_map_sizes[0])}, torch::TensorOptions().dtype(torch::kInt32)); + th_outs[0] = torch::empty( + {static_cast(in_coords_map_sizes[0])}, torch::TensorOptions().dtype(torch::kInt32)); + p_ins[0] = th_ins[0].data(); + p_outs[0] = th_outs[0].data(); + for (size_t i = 1; i != py_in_coords_keys.size(); ++i) { + // Set the tensor strides to the smallest elements. + p_in_coords_key = py_in_coords_keys[i].cast(); + transform(tensor_strides.begin(), /* In1 begin */ + tensor_strides.end(), /* In1 end */ + p_in_coords_key->getTensorStride().begin(), /* In2 begin */ + tensor_strides.begin(), /* out begin */ + [](int a, int b) -> int { return std::min(a, b); } /* binary op */ + ); + in_coords_maps[i] = coords_maps[p_in_coords_key->getKey()]; + in_coords_map_sizes[i] = in_coords_maps[i]->nrows; + total_in_keys += in_coords_map_sizes[i]; + th_ins[i] = torch::empty( + {static_cast(in_coords_map_sizes[i])}, torch::TensorOptions().dtype(torch::kInt32)); + th_outs[i] = torch::empty( + {static_cast(in_coords_map_sizes[i])}, torch::TensorOptions().dtype(torch::kInt32)); + p_ins[i] = th_ins[i].data(); + p_outs[i] = th_outs[i].data(); + + const uint64_t in_coords_key = p_in_coords_key->getKey(); + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + } + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + + // set a random coords key + const uint64_t out_coords_key = getRandomCoordsKey(); + + // Set the pycoordskey using the last coords_key + p_out_coords_key->setDimension(p_in_coords_key->getDimension()); + p_out_coords_key->setKey(out_coords_key); + p_out_coords_key->setTensorStride(tensor_strides); + + coords_maps[out_coords_key] = + std::make_shared>(total_in_keys, 1.0 / in_coords_map_sizes.size()); + + auto out_nrows = coords_maps[out_coords_key]->union_insert_search(in_coords_maps, + p_ins, p_outs, + in_coords_map_sizes); + + if (out_nrows < min_nrows) { + min_nrows = out_nrows; + min_coords_key = out_coords_key; + } + // Map key for origin hash map + const InOutMapKey map_key = + getUnionMapHashKey(py_in_coords_keys, py_out_coords_key); + + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + + return map_key; +} + +template +const InOutMapKey +GPUCoordsManager::getUnionInOutMaps(vector py_in_coords_keys, + py::object py_out_coords_key) { + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + + // Create output coordinates if it doesn't exist + if (!p_out_coords_key->key_set) + return createUnionInOutMaps(py_in_coords_keys, py_out_coords_key); + + const uint64_t out_coords_key = p_out_coords_key->getKey(); + + // Map key for origin hash map + const InOutMapKey map_key = + getUnionMapHashKey(py_in_coords_keys, py_out_coords_key); + + if (in_maps.find(map_key) == in_maps.end()) { + //vector>> in_coords_maps(py_in_coords_keys.size()); + vector>> in_coords_maps(py_in_coords_keys.size()); + vector in_coords_map_sizes(py_in_coords_keys.size()); + vector th_ins(py_in_coords_keys.size()); + vector th_outs(py_in_coords_keys.size()); + vector p_ins(py_in_coords_keys.size()); + vector p_outs(py_in_coords_keys.size()); + CoordsKey *p_in_coords_key = 
py_in_coords_keys[0].cast(); + auto tensor_strides = p_in_coords_key->getTensorStride(); + in_coords_maps[0] = coords_maps[p_in_coords_key->getKey()]; + in_coords_map_sizes[0] = in_coords_maps[0]->nrows; + int total_in_keys = in_coords_map_sizes[0]; + th_ins[0] = torch::empty( + {static_cast(in_coords_map_sizes[0])}, torch::TensorOptions().dtype(torch::kInt32)); + th_outs[0] = torch::empty( + {static_cast(in_coords_map_sizes[0])}, torch::TensorOptions().dtype(torch::kInt32)); + p_ins[0] = th_ins[0].data(); + p_outs[0] = th_outs[0].data(); + for (size_t i = 1; i != py_in_coords_keys.size(); ++i) { + // Set the tensor strides to the smallest elements. + p_in_coords_key = py_in_coords_keys[i].cast(); + transform(tensor_strides.begin(), /* In1 begin */ + tensor_strides.end(), /* In1 end */ + p_in_coords_key->getTensorStride().begin(), /* In2 begin */ + tensor_strides.begin(), /* out begin */ + [](int a, int b) -> int { return std::min(a, b); } /* binary op */ + ); + in_coords_maps[i] = coords_maps[p_in_coords_key->getKey()]; + in_coords_map_sizes[i] = in_coords_maps[i]->nrows; + total_in_keys += in_coords_map_sizes[i]; + th_ins[i] = torch::empty( + {static_cast(in_coords_map_sizes[i])}, torch::TensorOptions().dtype(torch::kInt32)); + th_outs[i] = torch::empty( + {static_cast(in_coords_map_sizes[i])}, torch::TensorOptions().dtype(torch::kInt32)); + p_ins[i] = th_ins[i].data(); + p_outs[i] = th_outs[i].data(); + + const uint64_t in_coords_key = p_in_coords_key->getKey(); + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + } + + coords_maps[out_coords_key]->union_search(in_coords_maps, + p_ins, p_outs, + in_coords_map_sizes); + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + } + + return map_key; +} + +template +uint64_t +GPUCoordsManager::createPruningCoords(at::Tensor use_feat, + py::object py_in_coords_key, + py::object py_out_coords_key) { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + ASSERT(!p_out_coords_key->isKeySet(), + "p_out_coords_key should be unsetted"); + + const uint64_t in_coords_key = p_in_coords_key->getKey(); + + ASSERT(existsCoordsKey(in_coords_key), + "The coord map doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + + // set a random coords key + const uint64_t out_coords_key = getRandomCoordsKey(); + + // Set the pycoordskey + p_out_coords_key->setKey(out_coords_key); + p_out_coords_key->setDimension(p_in_coords_key->getDimension()); + if (!p_out_coords_key->tensor_stride_set) + p_out_coords_key->setTensorStride(p_in_coords_key->getTensorStride()); + + coords_maps[out_coords_key] = std::make_shared>(use_feat.size(0)); + auto out_nrows = coords_maps[out_coords_key]->prune_insert(coords_maps[in_coords_key], + use_feat.data(), + use_feat.size(0), + coords_maps[in_coords_key]->nrows); + + if (out_nrows < min_nrows) { + min_nrows = out_nrows; + min_coords_key = out_coords_key; + } + return out_coords_key; +} + +template +const InOutMapKey +GPUCoordsManager::createPruningInOutMaps(at::Tensor use_feat, + py::object py_in_coords_key, + py::object py_out_coords_key) { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + ASSERT(!p_out_coords_key->isKeySet(), + "p_out_coords_key should be unsetted"); + + const uint64_t in_coords_key = p_in_coords_key->getKey(); + ASSERT(existsCoordsKey(in_coords_key), + "The coord map 
doesn't exist for the given coords_key: ", + to_string(in_coords_key), "."); + + const uint64_t out_coords_key = getRandomCoordsKey(); + p_out_coords_key->setKey(out_coords_key); + p_out_coords_key->setDimension(p_in_coords_key->getDimension()); + if (!p_out_coords_key->tensor_stride_set) + p_out_coords_key->setTensorStride(p_in_coords_key->getTensorStride()); + + const InOutMapKey map_key = + getOriginMapHashKey(py_in_coords_key, py_out_coords_key); + + if (in_maps.find(map_key) != in_maps.end()) return map_key; + + vector th_ins(1, torch::empty( + {static_cast(use_feat.size(0))}, + torch::TensorOptions().dtype(torch::kInt32))); + vector th_outs(1, torch::empty( + {static_cast(use_feat.size(0))}, + torch::TensorOptions().dtype(torch::kInt32))); + + int* p_in = th_ins[0].data(); + int* p_out = th_outs[0].data(); + + coords_maps[out_coords_key] = std::make_shared>(use_feat.size(0)); + auto out_nrows = coords_maps[out_coords_key]->prune_insert_search(coords_maps[in_coords_key], + p_in, p_out, + use_feat.data(), + use_feat.size(0), + coords_maps[in_coords_key]->nrows); + + if (out_nrows < min_nrows) { + min_nrows = out_nrows; + min_coords_key = out_coords_key; + } + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + return map_key; +} + +template +const InOutMapKey +GPUCoordsManager::getPruningInOutMaps(at::Tensor use_feat, + py::object py_in_coords_key, + py::object py_out_coords_key) { + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + CoordsKey *p_out_coords_key = py_out_coords_key.cast(); + + // Create output coordinates if it doesn't exist + if (!p_out_coords_key->key_set) { + // The following function setup py_out_coords_key + return createPruningInOutMaps(use_feat, py_in_coords_key, py_out_coords_key); + } + + const uint64_t in_coords_key = p_in_coords_key->getKey(); + const uint64_t out_coords_key = p_out_coords_key->getKey(); + + // Use the map key for origin hash map (stride, dilation, kernel are all + // NULL) + const InOutMapKey map_key = + getOriginMapHashKey(py_in_coords_key, py_out_coords_key); + + // For non transpose case + // make a kernel mapping. The kernel will be saved with the map_key. 
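+  // Note on the buffer layout used below: the in-map buffer is allocated with
+  // one extra slot; after the search, the element at index `use_feat.size(0)`
+  // holds the number of valid pairs and both tensors are resized to it. The
+  // strided and region searches above use the same convention.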
+ if (in_maps.find(map_key) == in_maps.end()) { + vector th_ins(1, torch::empty( + {static_cast(use_feat.size(0) + 1)}, + torch::TensorOptions().dtype(torch::kInt32))); + vector th_outs(1, torch::empty( + {static_cast(use_feat.size(0))}, + torch::TensorOptions().dtype(torch::kInt32))); + + int* p_in = th_ins[0].data(); + int* p_out = th_outs[0].data(); + +// coords_maps[out_coords_key] = GPUCoordsMap(use_feat.size(0)); + coords_maps[out_coords_key]->prune_search(coords_maps[in_coords_key], + p_in, p_out, + use_feat.data(), + use_feat.size(0), + coords_maps[in_coords_key]->nrows); + int size = *(p_in + use_feat.size(0)); + th_ins[0].resize_(size); + th_outs[0].resize_(size); + in_maps[map_key] = move(th_ins); + out_maps[map_key] = move(th_outs); + } + + return map_key; +} + + +template string GPUCoordsManager::toString() const { + Formatter out; + out << "< GPUCoordsManager\n\tNumber of Coordinate Maps: " + << to_string(coords_maps.size()); + for (const auto &kv : coords_maps) { + out << " \n\t\tCoordinate Map Key: " << to_string(kv.first) + << ", Size: " << to_string((kv.second)->size()); + } + out << "\n\tNumber of Kernel Maps: " << to_string(in_maps.size()); + for (const auto &kv : in_maps) { + size_t size = 0; + for (const auto &map : kv.second) + size += map.size(0); + out << " \n\t\tKernel In-Out Map Key: " + << to_string(hash_vec(kv.first)) + << ", Size: " << to_string(size); + } + out << " >\n"; + return out; +} + +// TODO(ljm): implement GPUCoordsMap::print +/* +template +void GPUCoordsManager::printDiagnostics(py::object py_coords_key) const { + CoordsKey *p_coords_key = py_coords_key.cast(); + const auto &map_iter = coords_maps.find(p_coords_key->getKey()); + ASSERT(map_iter != coords_maps.end(), "Coords map does not exist."); + map_iter->second.print(); +} +*/ + +/* + * Return row indices for each batch index + */ +template +at::Tensor +GPUCoordsManager::getRowIndicesAtBatchIndex(py::object py_in_coords_key, + py::object py_out_coords_key, + const int batch_index) { + // py_out_coords_key will be set after the above call. + CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + const auto in_coords_key = p_in_coords_key->getKey(); + const auto in_map_iter = coords_maps.find(in_coords_key); + ASSERT(in_map_iter != coords_maps.end(), + "The in_coords_key, ", to_string(in_coords_key), ", does not exist."); + + const auto& coordsmap = in_map_iter->second; + const auto nrows = coordsmap->nrows; + //const auto batch_num = coordsmap->get_batch_num(); + const auto batch_num = getBatchSize(); + ASSERT(batch_index < batch_num, "batch_index: ", to_string(batch_index), + " must smaller than batch_num: ", to_string(batch_num)); + + at::Tensor out_ind = torch::zeros( + {static_cast(nrows + 1)}, torch::TensorOptions().dtype(torch::kInt32).device(device)); + int* p_out_ind = out_ind.data(); + //coordsmap.GetIndexAtBatch(p_out_ind, batch_index); + coordsmap->get_index_at_batch(p_out_ind, batch_index, nrows); + int size = *(p_out_ind + nrows); + out_ind.resize_({size}); + //out_ind.resize_(c10::IntArrayRef(reinterpret_cast(p_out_ind + nrows), 1)); + + return out_ind; +} + +/* + * Return row indices per batch + */ +template +vector +GPUCoordsManager::getRowIndicesPerBatch(py::object py_in_coords_key, + py::object py_out_coords_key) { + // py_out_coords_key will be set after the above call. 
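+  // The batch count is taken from getBatchSize(), which lazily builds the
+  // origin coords map via createOriginCoords() on first use, rather than being
+  // read off the coords map itself.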
+ CoordsKey *p_in_coords_key = py_in_coords_key.cast(); + const auto in_coords_key = p_in_coords_key->getKey(); + const auto in_map_iter = coords_maps.find(in_coords_key); + ASSERT(in_map_iter != coords_maps.end(), + "The in_coords_key, ", to_string(in_coords_key), ", does not exist."); + + const auto& coordsmap = in_map_iter->second; +// const auto batch_num = coordsmap->get_batch_num(); + const auto batch_num = getBatchSize(); + const auto nrows = coordsmap->nrows; + // Return index. + vector out_inds(batch_num, torch::zeros( + {static_cast(nrows + 1)}, torch::TensorOptions().dtype(torch::kInt32))); + vector p_out_inds(batch_num); + for (size_t b = 0; b != batch_num; ++b) p_out_inds[b] = out_inds[b].data(); + //coordsmap.GetIndexPerBatch(p_out_inds); + coordsmap->get_index_per_batch(p_out_inds, nrows); + for (size_t b = 0; b != batch_num; ++b) { + int size = *(p_out_inds[b] + nrows); + out_inds[b].resize_({size}); +// out_inds[b].resize_(c10::IntArrayRef(reinterpret_cast(p_out_inds[b] + nrows), 1)); + } + + return out_inds; +} + +template class GPUCoordsManager; + +} // end namespace minkowski diff --git a/src/gpu_coords_manager.hpp b/src/gpu_coords_manager.hpp new file mode 100644 index 00000000..bdb8b644 --- /dev/null +++ b/src/gpu_coords_manager.hpp @@ -0,0 +1,291 @@ +/* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu). + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural + * Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part + * of the code. 
+ */ +#ifndef GPU_COORDS_MAN +#define GPU_COORDS_MAN + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "gpu_coordsmap.hpp" +#include "types.hpp" +#include "utils.hpp" + +#ifndef CPU_ONLY +#include "gpu_memory_manager.hpp" +#include +#endif // CPU_ONLY + +namespace minkowski { + +using std::begin; +using std::end; +using std::get; +using std::move; +using std::ref; +using std::string; +using std::to_string; +using std::unordered_map; + +/* +inline vector computeOutTensorStride(const vector &tensor_strides, + const vector &strides, + bool is_transpose) { + vector out_tensor_strides; + ASSERT(tensor_strides.size() == strides.size(), + "The dimension of tensor_stride: ", ArrToString(tensor_strides), + " does not match the dimension of strides: ", ArrToString(strides)); + for (size_t i = 0; i < strides.size(); i++) { + if (is_transpose) { + ASSERT(tensor_strides[i] % strides[i] == 0, + "The output tensor stride is not divisible by ", + "up_strides. tensor stride: ", ArrToString(tensor_strides), + ", up_strides: ", ArrToString(strides)); + out_tensor_strides.push_back(tensor_strides[i] / strides[i]); + } else + out_tensor_strides.push_back(tensor_strides[i] * strides[i]); + } + return out_tensor_strides; +} +*/ + +#ifndef CPU_ONLY + +template int getInOutMapsSizeGPU(const VType &map) { + int n = 0; + for (auto cmap = begin(map); cmap != end(map); ++cmap) + n += cmap->size(0); + return n; +} + +template class GPUCoordsManager { +public: + // Variables + // + // Coordinate hash key to coordinate hash map + unordered_map>> coords_maps; + + set batch_indices; + int batch_size; + int D; + int device_id; + c10::Device device; + int min_nrows; + uint64_t min_coords_key; + + std::shared_ptr gpu_memory_manager; + + // In to out index mapping for each kernel, pooling + unordered_map, InOutMapKeyHash> in_maps; + unordered_map, InOutMapKeyHash> out_maps; + + GPUCoordsManager(int D, + int device_id, + MemoryManagerBackend backend) : batch_size(-1), device(c10::DeviceType::CUDA, 0) { + gpu_memory_manager = std::make_shared(backend); + this->device_id = device_id; + this->D = D; + min_nrows = INT_MAX; + } + ~GPUCoordsManager() { clear(); } + +// TODO(ljm): implement GPUCoordsMap::print +// void printDiagnostics(py::object py_coords_key) const; + + uint64_t getCoordsKey(const vector &tensor_strides) const; + bool existsCoordsKey(uint64_t coords_key) const; + bool existsCoordsKey(py::object py_coords_key) const; + bool existsInOutMapKey(const InOutMapKey &map_key) const { + return in_maps.find(map_key) != in_maps.end(); + } + int getCoordsSize(uint64_t coords_key) const; + int getCoordsSize(py::object py_coords_key) const; + uint64_t getRandomCoordsKey(); + long int getBatchSize(); + set getBatchIndices() { + if (batch_indices.empty()) { + for (int b = 0; b != getBatchSize(); ++b) batch_indices.insert(b); + } + ASSERT((int)batch_indices.size() == getBatchSize(), + "batch_indices.size() must be equal to getBatchSize()"); + return batch_indices; + } + void getCoords(at::Tensor coords, py::object py_coords_key) const; + vector> + getKernelMap(const vector& tensor_strides, const vector& strides, + const vector& kernel_sizes, const vector& dilations, int region_type, + at::Tensor offsets, py::object py_in_coords_key, + py::object py_out_coords_key, bool is_transpose, bool is_pool); + // TODO make this function non-const with ability to generate a new map + vector getCoordsMap(py::object py_in_coords_key, + py::object py_out_coords_key) const; 
+ pair, vector> + getUnionMap(vector py_in_coords_keys, + py::object py_out_coords_key); + + // Set the py_coords_key to the origin coords map key + void setOriginCoordsKey(py::object py_coords_key); + + // New coords map initialzation entry + uint64_t initializeCoords(at::Tensor coords, at::Tensor mapping, + at::Tensor inverse_mapping, + const vector &tensor_strides, + const bool force_creation, const bool force_remap, + const bool allow_duplicate_coords, + const bool return_inverse); + + uint64_t initializeCoords(at::Tensor coords, at::Tensor mapping, + at::Tensor inverse_mapping, + py::object py_coords_key, const bool force_creation, + const bool force_remap, + const bool allow_duplicate_coords, + const bool return_inverse); + + // New coords map given an input + uint64_t createStridedCoords(uint64_t coords_key, + const vector &tensor_strides, + const vector &strides, bool force_creation); + uint64_t createTransposedStridedRegionCoords( + uint64_t coords_key, const vector &tensor_strides, + const vector &strides, vector kernel_sizes, + vector dilations, int region_type, at::Tensor offsets, + bool force_creation); + uint64_t createPruningCoords(at::Tensor use_feat, py::object py_in_coords_key, + py::object py_out_coords_key); + uint64_t createOriginCoords(const int D); + uint64_t createUnionCoords(vector py_in_coords_keys, + py::object py_out_coords_key); + + // Mappings + const InOutMapKey getMapHashKey(vector tensor_strides, + vector strides, vector kernel_sizes, + vector dilations, int region_type, + py::object py_in_coords_key, + py::object py_out_coords_key, + bool is_transpose, bool is_pool) const; + const InOutMapKey getOriginMapHashKey(py::object py_in_coords_key, + py::object py_out_coords_key) const; + const InOutMapKey getUnionMapHashKey(vector py_in_coords_keys, + py::object py_out_coords_key) const; + + // Wrapper functions for setting up coords and returning maps + const InOutMapKey + getInOutMaps(const vector &tensor_strides, const vector &strides, + const vector &kernel_sizes, const vector &dilations, + int region_type, const at::Tensor &offsets, + py::object py_in_coords_key, py::object py_out_coords_key, + bool is_transpose, bool is_pool = false, + bool generate_new_coords = false); + + const InOutMapKey getOriginInOutMaps(py::object py_in_coords_key, + py::object py_out_coords_key); + + const InOutMapKey getPruningInOutMaps(at::Tensor use_feat, + py::object py_in_coords_key, + py::object py_out_coords_key); + + const InOutMapKey + getUnionInOutMaps(vector py_in_coords_keys, + py::object py_out_coords_key); + + const InOutMapKey + getStridedInOutMaps( + py::object py_in_coords_key, py::object py_out_coords_key, + const vector& tensor_strides, const vector& strides, + const vector& kernel_sizes, const vector& dilations, int region_type, + bool is_transpose, bool is_pool, + bool force_creation); + + const InOutMapKey + createStridedInOutMaps( + py::object py_in_coords_key, py::object py_out_coords_key, + const vector &tensor_strides, + const vector &strides, + vector kernel_sizes, vector dilations, int region_type, + bool is_transpose, bool is_pool, + bool force_creation); + + const InOutMapKey + getTransposedStridedRegionInOutMaps( + py::object py_in_coords_key, py::object py_out_coords_key, + const vector& tensor_strides, + const vector& strides, const vector& kernel_sizes, const vector& dilations, + int region_type, + bool is_transpose, bool is_pool, + at::Tensor offsets, + bool force_creation); + + const InOutMapKey + createTransposedStridedRegionInOutMaps( + 
py::object py_in_coords_key, py::object py_out_coords_key, + const vector& tensor_strides, + const vector& strides, const vector& kernel_sizes, const vector& dilations, + int region_type, + bool is_transpose, bool is_pool, + at::Tensor offsets, bool force_creation); + + const InOutMapKey + createUnionInOutMaps(const vector& py_in_coords_keys, + py::object py_out_coords_key); + + const InOutMapKey + createPruningInOutMaps(at::Tensor use_feat, + py::object py_in_coords_key, + py::object py_out_coords_key); + + string toString() const; + void clear() { + coords_maps.clear(); + in_maps.clear(); + out_maps.clear(); + } + + at::Tensor getRowIndicesAtBatchIndex(py::object py_in_coords_key, + py::object py_out_coords_key, + const int batch_index); + vector getRowIndicesPerBatch(py::object py_in_coords_key, + py::object py_out_coords_key); + + void *getScratchGPUMemory(size_t size) { + return gpu_memory_manager.get()->tmp_data(size); + } + + void clearScratchGPUMemory() { gpu_memory_manager.get()->clear_tmp(); } + +}; // gpucoordsmanager +#endif + +} // namespace minkowski + +#endif // GPU_COORDS_MAN diff --git a/src/gpu_coordsmap.cpp b/src/gpu_coordsmap.cpp new file mode 100644 index 00000000..874afbad --- /dev/null +++ b/src/gpu_coordsmap.cpp @@ -0,0 +1,308 @@ +/* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu). + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural + * Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part + * of the code. 
+ */ +#include +#include +#include + +#include "gpu_coordsmap.hpp" + +namespace minkowski { + +/* + * Use this function when batch_size is setted outside +template +GPUCoordsMap::GPUCoordsMap(int ncols_, int batch_size) + : nrows(batch_size), ncols(ncols_) { + map->BulkBatchIndiceInsert(ncols_, batch_size); +} +*/ + +// TODO(ljm): add prune_insert, prune_insert_search, prune_search + +/* +template +GPUCoordsMap::GPUCoordsMap(uint32_t map_size, float duplicate_factor, + uint32_t keys_per_bucket=62, const uint32_t device_id=0) { + // TODO(ljm): add this api + map->reserve(map_size, duplicate_factor, + keys_per_bucket, device_id); +} +*/ + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::initialize_batch(const int* p_coords, + int* p_mapping, + int* p_inverse_mapping, + const int nrows_, + const int ncols_, const bool force_remap, + const bool return_inverse) { + nrows = nrows_; + ncols = ncols_; + + map->BulkInsert(p_coords, p_mapping, p_inverse_mapping, nrows, ncols); + + nrows = map->Size(); + return nrows; +} + +template +void GPUCoordsMap::get_coords(int* p_coords, int size) { + map->IterateKeys(p_coords, size); +} + +template +void GPUCoordsMap::get_index_at_batch(int* p_out, + int batch_index, + int nrows_) { + map->IterateSearchAtBatch(p_out, batch_index, nrows_); +} + +template +void GPUCoordsMap::get_index_per_batch( + const vector& p_outs, + int nrows_) { + map->IterateSearchPerBatch(p_outs, nrows_); +} + +template +typename GPUCoordsMap::value_type +//GPUCoordsMap::region_insert(const GPUCoordsMap& in_coords_map, +GPUCoordsMap::region_insert(const std::shared_ptr>& in_coords_map, + const Region ®ion, int size) { + ASSERT(region.tensor_strides.size() == ncols - 1, "Invalid tensor strides"); + + vector offsets(region.size(), torch::empty( + {static_cast(ncols)}, torch::TensorOptions().dtype(torch::kInt32))); + vector origin(ncols, 0); + Region cregion(region); + cregion.set_bounds(origin); + int c = 0; + for (const auto& point : cregion) { + CHECK_CUDA(cudaMemcpy(offsets[c].data(), point.data(), + sizeof(int) * ncols, + cudaMemcpyHostToDevice)); + map->IterateOffsetInsert(in_coords_map->map, + //map.IterateOffsetInsert(map, + offsets[c].data(), + in_coords_map->nrows); + ++c; + } + nrows = map->Size(); + return nrows; +} + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::region_insert_search(const shared_ptr>& in_coords_map, + const vector& p_ins, + const vector& p_outs, + const Region ®ion, + int size) { + ASSERT(region.tensor_strides.size() == ncols - 1, "Invalid tensor strides"); + + vector offsets(region.size(), torch::empty( + {static_cast(ncols)}, torch::TensorOptions().dtype(torch::kInt32))); + vector origin(ncols, 0); + Region cregion(region); + cregion.set_bounds(origin); + int c = 0; + for (const auto& point : cregion) { + CHECK_CUDA(cudaMemcpy(offsets[c].data(), point.data(), + sizeof(int) * ncols, + cudaMemcpyHostToDevice)); + map->IterateOffsetInsertWithInsOuts(in_coords_map->map, + offsets[c].data(), + p_ins[c], p_outs[c], + size); + ++c; + } + nrows = map->Size(); + return nrows; +} + +template +void +GPUCoordsMap::region_search(const shared_ptr>& in_coords_map, + const vector& p_ins, + const vector& p_outs, + const Region ®ion, + int size) { + ASSERT(region.tensor_strides.size() == ncols - 1, "Invalid tensor strides"); + + vector offsets(region.size(), torch::empty( + {static_cast(ncols)}, torch::TensorOptions().dtype(torch::kInt32))); + vector origin(ncols, 0); + Region cregion(region); + cregion.set_bounds(origin); + int c = 0; + for (const 
auto& point : cregion) { + CHECK_CUDA(cudaMemcpy(offsets[c].data(), point.data(), + sizeof(int) * ncols, + cudaMemcpyHostToDevice)); + map->IterateOffsetSearch(in_coords_map->map, + offsets[c].data(), + p_ins[c], p_outs[c], + size); + ++c; + } +} + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::batch_insert(const shared_ptr>& in_coords_map, + int size) { + map->IterateBatchInsert(in_coords_map->map, size); + nrows = map->Size(); + return nrows; +} + +template +void +GPUCoordsMap::batch_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, int size) { + map->IterateBatchSearch(in_coords_map->map, p_in, p_out, size); +} + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::prune_insert(const shared_ptr>& in_coords_map, + bool* p_keep, int keep_size, + int size) { + map->IteratePruneInsert(in_coords_map->map, p_keep, keep_size, size); + nrows = map->Size(); + return nrows; +} + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::prune_insert_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, + bool* p_keep, int keep_size, + int size) { + map->IteratePruneInsertWithInOut(in_coords_map->map, + p_in, p_out, + p_keep, keep_size, size); + nrows = map->Size(); + return nrows; +} + +template +void +GPUCoordsMap::prune_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, + bool* p_keep, int keep_size, + int size) { + map->IteratePruneSearch(in_coords_map->map, + p_in, p_out, + p_keep, keep_size, size); +} + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::stride_insert(const shared_ptr>& in_coords_map, + const vector& tensor_strides, + int size) { + map->IterateStrideInsert(in_coords_map->map, tensor_strides, size); + nrows = map->Size(); + return nrows; +} + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::stride_insert_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, + const vector& tensor_strides, + int size) { + map->IterateStrideInsertWithInOut(in_coords_map->map, + p_in, p_out, + tensor_strides, size); + nrows = map->Size(); + return nrows; +} + +template +void +GPUCoordsMap::stride_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, + const vector& tensor_strides, + int size) { + map->IterateStrideSearch(in_coords_map->map, + p_in, p_out, + tensor_strides, size); +} + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::union_insert( + const vector>>& in_maps, + const vector& in_coords_map_sizes) { + for (size_t i = 0; i != in_maps.size(); ++i) { + map->IterateInsert(in_maps[i]->map, + in_coords_map_sizes[i]); + } + nrows = map->Size(); + return nrows; +} + +template +typename GPUCoordsMap::value_type +GPUCoordsMap::union_insert_search( + const vector>>& in_maps, + const vector& p_ins, const vector& p_outs, + const vector& in_coords_map_sizes) { + for (size_t i = 0; i != in_maps.size(); ++i) { + map->IterateInsertWithInsOuts(in_maps[i]->map, p_ins[i], p_outs[i], + in_coords_map_sizes[i]); + } + nrows = map->Size(); + return nrows; +} + +template +void GPUCoordsMap::union_search( + const vector>>& in_maps, + const vector& p_ins, const vector& p_outs, + const vector& in_coords_map_sizes) { + for (size_t i = 0; i != in_maps.size(); ++i) { + map->IterateSearch(in_maps[i]->map, p_ins[i], p_outs[i], + in_coords_map_sizes[i]); + } +} + +// TODO(ljm): add a debug helper function here +/* +template void GPUCoordsMap::print() const { + for (const auto &kv : map) { + std::cout << ArrToString(kv.first) << ":" << kv.second << "\n"; + } + std::cout << std::flush; +} +*/ + 
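+// Only a single explicit instantiation is emitted below; the commented-out
+// line keeps a second map specialization available if it is ever needed.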
+template struct GPUCoordsMap; +//template struct GPUCoordsMap; + +} // end namespace minkowski diff --git a/src/gpu_coordsmap.hpp b/src/gpu_coordsmap.hpp new file mode 100644 index 00000000..a55fcb4e --- /dev/null +++ b/src/gpu_coordsmap.hpp @@ -0,0 +1,260 @@ +/* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu). + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural + * Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part + * of the code. + */ +#ifndef GPU_COORDSMAP +#define GPU_COORDSMAP + +#include +#include +#include +#include +#include + +#include "3rdparty/gpu_coords_map/include/cuda_unordered_map.h" +#include "3rdparty/gpu_coords_map/include/coordinate.h" + +#include "region.hpp" +#include "types.hpp" + +namespace minkowski { + +using std::reference_wrapper; +using std::set; +using std::tuple; +using std::vector; +using std::shared_ptr; + +// TODO(ljm): enumerate and `DISPATCH` all possible combination +// D = 3 +using CoordsToIndexMap_int_4_int_5_0 = + cuda::unordered_map, int, 5, 0>; + +using CoordsToIndexMap_int_4_int_5_1 = + cuda::unordered_map, int, 5, 1>; + +using CoordsToIndexMap_int_4_int_5_2 = + cuda::unordered_map, int, 5, 2>; + +using CoordsToIndexMap_int_4_int_5_3 = + cuda::unordered_map, int, 5, 3>; + +using CoordsToIndexMap_int_4_int_5_4 = + cuda::unordered_map, int, 5, 4>; + +using CoordsToIndexMap_int_4_int_5_5 = + cuda::unordered_map, int, 5, 5>; + +using CoordsToIndexMap_int_4_int_5_6 = + cuda::unordered_map, int, 5, 6>; + +using CoordsToIndexMap_int_4_int_5_7 = + cuda::unordered_map, int, 5, 7>; + +using CoordsToIndexMap_int_4_int_5_8 = + cuda::unordered_map, int, 5, 8>; + +// D = 4 +using CoordsToIndexMap_int_5_int_5_0 = + cuda::unordered_map, int, 5, 0>; + +using CoordsToIndexMap_int_5_int_5_1 = + cuda::unordered_map, int, 5, 1>; + +using CoordsToIndexMap_int_5_int_5_2 = + cuda::unordered_map, int, 5, 2>; + +using CoordsToIndexMap_int_5_int_5_3 = + cuda::unordered_map, int, 5, 3>; + +using CoordsToIndexMap_int_5_int_5_4 = + cuda::unordered_map, int, 5, 4>; + +using CoordsToIndexMap_int_5_int_5_5 = + cuda::unordered_map, int, 5, 5>; + +using CoordsToIndexMap_int_5_int_5_6 = + cuda::unordered_map, int, 5, 6>; + +using CoordsToIndexMap_int_5_int_5_7 = + cuda::unordered_map, int, 5, 7>; + +using CoordsToIndexMap_int_5_int_5_8 = + cuda::unordered_map, int, 5, 8>; + +// D = 5 +using CoordsToIndexMap_int_6_int_5_0 = + cuda::unordered_map, int, 5, 
0>; + +using CoordsToIndexMap_int_6_int_5_1 = + cuda::unordered_map, int, 5, 1>; + +using CoordsToIndexMap_int_6_int_5_2 = + cuda::unordered_map, int, 5, 2>; + +using CoordsToIndexMap_int_6_int_5_3 = + cuda::unordered_map, int, 5, 3>; + +using CoordsToIndexMap_int_6_int_5_4 = + cuda::unordered_map, int, 5, 4>; + +using CoordsToIndexMap_int_6_int_5_5 = + cuda::unordered_map, int, 5, 5>; + +using CoordsToIndexMap_int_6_int_5_6 = + cuda::unordered_map, int, 5, 6>; + +using CoordsToIndexMap_int_6_int_5_7 = + cuda::unordered_map, int, 5, 7>; + +using CoordsToIndexMap_int_6_int_5_8 = + cuda::unordered_map, int, 5, 8>; + +// D = 6 +using CoordsToIndexMap_int_7_int_5_0 = + cuda::unordered_map, int, 5, 0>; + +using CoordsToIndexMap_int_7_int_5_1 = + cuda::unordered_map, int, 5, 1>; + +using CoordsToIndexMap_int_7_int_5_2 = + cuda::unordered_map, int, 5, 2>; + +using CoordsToIndexMap_int_7_int_5_3 = + cuda::unordered_map, int, 5, 3>; + +using CoordsToIndexMap_int_7_int_5_4 = + cuda::unordered_map, int, 5, 4>; + +using CoordsToIndexMap_int_7_int_5_5 = + cuda::unordered_map, int, 5, 5>; + +using CoordsToIndexMap_int_7_int_5_6 = + cuda::unordered_map, int, 5, 6>; + +using CoordsToIndexMap_int_7_int_5_7 = + cuda::unordered_map, int, 5, 7>; + +using CoordsToIndexMap_int_7_int_5_8 = + cuda::unordered_map, int, 5, 8>; + + +using CoordsToIndexMapGPU = CoordsToIndexMap_int_4_int_5_5; + +template struct GPUCoordsMap { + shared_ptr map; + using key_type = typename MapType::key_type; + using value_type = typename MapType::value_type; + int nrows, ncols; + + // Constructors + GPUCoordsMap(uint32_t map_size, float duplicate_factor=1.0, + uint32_t keys_per_bucket=62, const uint32_t device_id=0) { + /* + map->reserve(map_size, duplicate_factor, + keys_per_bucket, device_id); + */ + map = std::make_shared(map_size, duplicate_factor, + keys_per_bucket, device_id); + } + + // Initializations + value_type + initialize_batch(const int* p_coords_, + int* p_mapping_, + int* p_inverse_mapping_, + const int nrows_, const int ncols_, + const bool force_remap = false, + const bool return_inverse = false); + + void get_coords(int* p_coords, int size); + void get_index_at_batch(int* p_out, int batch_index, int nrows_); + void get_index_per_batch(const vector& p_outs, int nrows_); + value_type + //region_insert(const GPUCoordsMap& in_coords_map, + region_insert(const shared_ptr>& in_coords_map, + const Region ®ion, int size); + value_type + region_insert_search(const shared_ptr>& in_coords_map, + const vector& p_ins, + const vector& p_outs, + const Region ®ion, + int size); + void region_search(const shared_ptr>& in_coords_map, + const vector& p_ins, + const vector& p_outs, + const Region ®ion, + int size); + value_type + batch_insert(const shared_ptr>& in_coords_map, int size); + void batch_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, int size); + value_type + stride_insert(const shared_ptr>& in_coords_map, + const vector& tensor_strides, + int size); + value_type + stride_insert_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, + const vector& tensor_strides, + int size); + void stride_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, + const vector& tensor_strides, + int size); + value_type + union_insert( + const vector>>& in_maps, + const vector& in_coords_map_sizes); + value_type + union_insert_search( + const vector>>& in_maps, + const vector& p_ins, const vector& p_outs, + const vector& in_coords_map_sizes); + void union_search( + const vector>>& in_maps, + const 
vector& p_ins, const vector& p_outs, + const vector& in_coords_map_sizes); + value_type + prune_insert(const shared_ptr>& in_coords_map, + bool* p_keep, int keep_size, + int size); + value_type + prune_insert_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, + bool* p_keep, int keep_size, + int size); + void + prune_search(const shared_ptr>& in_coords_map, + int* p_in, int* p_out, + bool* p_keep, int keep_size, + int size); + size_t size() const { + ASSERT(map->Size() == nrows, "map->Size() should equal to nrows"); + return nrows; + } +}; + +} // end namespace minkowski + +#endif // gpu coordsmap diff --git a/src/gpu_memory_manager.hpp b/src/gpu_memory_manager.hpp index 255c32da..8fb4a5e9 100644 --- a/src/gpu_memory_manager.hpp +++ b/src/gpu_memory_manager.hpp @@ -77,6 +77,7 @@ class GPUMemoryManager { pInOutMaps copyInOutMapToGPU(const InOutMaps &map); +// TODO(ljm): support multi-thread here void clear_tmp() { for (auto p_buffer : tmp_vec_ptr) { cudaFree(p_buffer); @@ -107,10 +108,14 @@ class GPUMemoryManager { } case PYTORCH: { // std::cout << "Malloc PYTORCH: " << device_id << std::endl; + std::cout << "not support currently: " << device_id << std::endl; + // + /* CUDA_CHECK(cudaSetDevice(device_id)); p_buffer = c10::cuda::CUDACachingAllocator::raw_alloc_with_stream( size, at::cuda::getCurrentCUDAStream()); persist_vec_ptr.push_back(p_buffer); + */ break; } } diff --git a/src/pooling_avg.cpp b/src/pooling_avg.cpp index 9fafb9d4..9c5780bf 100644 --- a/src/pooling_avg.cpp +++ b/src/pooling_avg.cpp @@ -101,9 +101,9 @@ void AvgPoolingForwardGPU(at::Tensor in_feat, at::Tensor out_feat, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); - const auto &in_out = p_coords_manager->getInOutMapsGPU( + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); + const InOutMapKey map_key = p_coords_manager->getInOutMaps( tensor_strides, strides, kernel_sizes, dilations, region_type, offsets, py_in_coords_key, py_out_coords_key, false, true); @@ -124,7 +124,10 @@ void AvgPoolingForwardGPU(at::Tensor in_feat, at::Tensor out_feat, NonzeroAvgPoolingForwardKernelGPU( in_feat.template data(), in_feat.size(0), out_feat.template data(), out_nrows, num_nonzero_data, - in_feat.size(1), in_out.first, in_out.second, use_avg, handle, + in_feat.size(1), + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + use_avg, handle, at::cuda::getCurrentCUDAStream()); } @@ -136,8 +139,8 @@ void AvgPoolingBackwardGPU(at::Tensor in_feat, at::Tensor grad_in_feat, int region_type, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); const InOutMapKey map_key = p_coords_manager->getMapHashKey( tensor_strides, strides, kernel_sizes, dilations, region_type, py_in_coords_key, py_out_coords_key, false, true); @@ -153,8 +156,8 @@ void AvgPoolingBackwardGPU(at::Tensor in_feat, at::Tensor grad_in_feat, grad_in_feat.template data(), in_feat.size(0), grad_out_feat.template data(), grad_out_feat.size(0), num_nonzero.template data(), in_feat.size(1), - p_coords_manager->d_in_maps[map_key], - p_coords_manager->d_out_maps[map_key], use_avg, + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], use_avg, at::cuda::getCurrentCUDAStream()); } #endif 
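
Note on the convention introduced in the pooling hunks above: the GPU paths no longer receive a pair of pInOutMaps<int>; they look up an InOutMapKey and index the manager's in_maps / out_maps tables, which hold contiguous int32 CUDA tensors. The following is a self-contained sketch of that storage convention, not code from the patch; it uses only plain libtorch and the CUDA runtime, and the buffer sizes are illustrative.

#include <torch/torch.h>
#include <cuda_runtime.h>
#include <vector>

int main() {
  // Hypothetical kernel map for a single offset, already resident on the GPU,
  // matching what in_maps[map_key] / out_maps[map_key] are expected to hold.
  auto opts = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA);
  std::vector<at::Tensor> in_maps{torch::arange(0, 16, opts)};

  // Scratch sizing now accumulates Tensor::size(0) instead of pVector::size().
  int64_t nmaps = 0;
  for (const auto &map : in_maps)
    nmaps += map.size(0);

  // The maps already live on the device, so the copy into the kernel's scratch
  // buffer stays device-to-device, sourced from the tensor's raw int pointer
  // (the patch itself spells this in_maps[0].data<int>()).
  int *d_scr = nullptr;
  cudaMalloc((void **)&d_scr, nmaps * sizeof(int));
  cudaMemcpy(d_scr, in_maps[0].data_ptr<int>(), nmaps * sizeof(int),
             cudaMemcpyDeviceToDevice);
  cudaFree(d_scr);
  return 0;
}

This also explains why the kernels in pooling_avg.cu below accumulate nmaps with map.size(0) and then copy from in_maps[0] alone: as the existing comments state, the per-offset maps are contiguous, so the first tensor's pointer covers all nmaps entries.
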
@@ -188,28 +191,28 @@ template void AvgPoolingBackwardCPU( py::object py_coords_manager, bool use_avg); #ifndef CPU_ONLY -template void AvgPoolingForwardGPU( +template void AvgPoolingForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg); -template void AvgPoolingForwardGPU( +template void AvgPoolingForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg); -template void AvgPoolingBackwardGPU( +template void AvgPoolingBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg); -template void AvgPoolingBackwardGPU( +template void AvgPoolingBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, diff --git a/src/pooling_avg.cu b/src/pooling_avg.cu index 1f1865a2..5ad34ccb 100644 --- a/src/pooling_avg.cu +++ b/src/pooling_avg.cu @@ -116,8 +116,7 @@ template void NonzeroAvgPoolingForwardKernelGPU(const Dtype *d_in_feat, int in_nrows, Dtype *d_out_feat, int out_nrows, Dtype *d_num_nonzero, int nchannel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, bool use_avg, cusparseHandle_t cushandle, cudaStream_t stream) { int nmaps = 0; @@ -129,7 +128,7 @@ void NonzeroAvgPoolingForwardKernelGPU(const Dtype *d_in_feat, int in_nrows, // Copy all maps to one vector for (const auto &map : in_maps) - nmaps += map.size(); + nmaps += map.size(0); /* Map prep */ // Create d in map @@ -139,10 +138,10 @@ void NonzeroAvgPoolingForwardKernelGPU(const Dtype *d_in_feat, int in_nrows, d_out_map = d_scr + nmaps; // n_maps d_csr_row = d_scr + 2 * nmaps; // out_nrows + 1 - CUDA_CHECK(cudaMemcpy(d_in_map, in_maps[0].data(), nmaps * sizeof(int), + CUDA_CHECK(cudaMemcpy(d_in_map, in_maps[0].data(), nmaps * sizeof(int), cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaMemcpy(d_out_map, out_maps[0].data(), nmaps * sizeof(int), + CUDA_CHECK(cudaMemcpy(d_out_map, out_maps[0].data(), nmaps * sizeof(int), cudaMemcpyDeviceToDevice)); /* sparse mm prep */ @@ -235,38 +234,40 @@ void NonzeroAvgPoolingForwardKernelGPU(const Dtype *d_in_feat, int in_nrows, template void NonzeroAvgPoolingForwardKernelGPU( const float *d_in_feat, int in_nrows, float *d_out_feat, int out_nrows, - float *d_num_nonzero, int nchannel, const pInOutMaps &in_map, - const pInOutMaps &out_map, bool use_avg, + float *d_num_nonzero, int nchannel, + const vector& in_maps, const vector& out_maps, + bool use_avg, cusparseHandle_t cushandle, cudaStream_t stream); template void NonzeroAvgPoolingForwardKernelGPU( const double *d_in_feat, int in_nrows, double *d_out_feat, int out_nrows, - double *d_num_nonzero, int nchannel, const pInOutMaps &in_map, - const pInOutMaps &out_map, bool use_avg, + double *d_num_nonzero, int nchannel, + const vector& in_maps, const 
vector& out_maps, + bool use_avg, cusparseHandle_t cushandle, cudaStream_t stream); template void NonzeroAvgPoolingBackwardKernelGPU( Dtype *d_grad_in_feat, int in_nrows, const Dtype *d_grad_out_feat, int out_nrows, const Dtype *d_num_nonzero, int nchannel, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, bool use_avg, cudaStream_t stream) { // d_grad_in_feat must be all set to 0 int nmaps = 0; for (const auto &map : in_maps) - nmaps += map.size(); + nmaps += map.size(0); if (use_avg) { set_gradient_nonzero_avg <<>>( nmaps * nchannel, d_grad_out_feat, d_grad_in_feat, nchannel, - d_num_nonzero, in_maps[0].data(), out_maps[0].data()); + d_num_nonzero, in_maps[0].data(), out_maps[0].data()); } else { set_gradient_nonzero <<>>( nmaps * nchannel, d_grad_out_feat, d_grad_in_feat, nchannel, - in_maps[0].data(), out_maps[0].data()); + in_maps[0].data(), out_maps[0].data()); } CUDA_CHECK(cudaGetLastError()); @@ -276,13 +277,13 @@ void NonzeroAvgPoolingBackwardKernelGPU( template void NonzeroAvgPoolingBackwardKernelGPU( float *d_grad_in_feat, int in_nrows, const float *d_grad_out_feat, int out_nrows, const float *d_num_nonzero, int nchannel, - const pInOutMaps &in_map, const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, bool use_avg, cudaStream_t stream); template void NonzeroAvgPoolingBackwardKernelGPU( double *d_grad_in_feat, int in_nrows, const double *d_grad_out_feat, int out_nrows, const double *d_num_nonzero, int nchannel, - const pInOutMaps &in_map, const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, bool use_avg, cudaStream_t stream); } // end namespace minkowski diff --git a/src/pooling_avg.cuh b/src/pooling_avg.cuh index 1ee83050..eeaeca12 100644 --- a/src/pooling_avg.cuh +++ b/src/pooling_avg.cuh @@ -27,6 +27,7 @@ #include #include +#include #include "gpu.cuh" #include "math_functions.hpp" @@ -38,8 +39,7 @@ template void NonzeroAvgPoolingForwardKernelGPU(const Dtype *d_in_feat, int in_nrows, Dtype *d_out_feat, int out_nrows, Dtype *d_num_nonzero, int nchannel, - const pInOutMaps &in_map, - const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, bool use_avg, cusparseHandle_t cushandle, cudaStream_t stream); @@ -47,7 +47,7 @@ template void NonzeroAvgPoolingBackwardKernelGPU( Dtype *d_grad_in_feat, int in_nrows, const Dtype *d_grad_out_feat, int out_nrows, const Dtype *d_num_nonzero, int nchannel, - const pInOutMaps &in_map, const pInOutMaps &out_map, + const vector& in_maps, const vector& out_maps, bool use_avg, cudaStream_t stream); } // end namespace minkowski diff --git a/src/pooling_global_avg.cpp b/src/pooling_global_avg.cpp index 1c648aac..6f1feff3 100644 --- a/src/pooling_global_avg.cpp +++ b/src/pooling_global_avg.cpp @@ -134,8 +134,8 @@ vector GlobalPoolingForwardGPU(at::Tensor in_feat, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg, int pooling_mode) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); const auto batch_size = p_coords_manager->getBatchSize(); if (batch_size == 1) { @@ -173,7 +173,7 @@ vector GlobalPoolingForwardGPU(at::Tensor in_feat, } } break; case 2: { - const auto &in_outs = p_coords_manager->getOriginInOutMapsGPU( + const InOutMapKey map_key = p_coords_manager->getOriginInOutMaps( py_in_coords_key, py_out_coords_key); cusparseHandle_t handle = at::cuda::getCurrentCUDASparseHandle(); @@ -182,8 +182,10 @@ vector 
GlobalPoolingForwardGPU(at::Tensor in_feat, NonzeroAvgPoolingForwardKernelGPU( in_feat.template data(), in_feat.size(0), out_feat.template data(), batch_size, - num_nonzero.template data(), in_feat.size(1), in_outs.first, - in_outs.second, use_avg, handle, at::cuda::getCurrentCUDAStream()); + num_nonzero.template data(), in_feat.size(1), + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + use_avg, handle, at::cuda::getCurrentCUDAStream()); } break; default: @@ -199,8 +201,8 @@ GlobalPoolingBackwardGPU(at::Tensor in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg) { - CoordsManager *p_coords_man = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_man = + py_coords_manager.cast *>(); const auto batch_size = p_coords_man->getBatchSize(); auto grad_in_feat = torch::empty_like(in_feat); @@ -218,15 +220,13 @@ GlobalPoolingBackwardGPU(at::Tensor in_feat, at::Tensor grad_out_feat, p_coords_man->existsInOutMapKey(map_key), "The in-out map doesn't exist for backward. Did you run forward pass?"); - p_coords_man->copyInOutMapsToGPU(map_key); - grad_in_feat.zero_(); NonzeroAvgPoolingBackwardKernelGPU( grad_in_feat.template data(), in_feat.size(0), grad_out_feat.template data(), grad_out_feat.size(0), num_nonzero.template data(), in_feat.size(1), - p_coords_man->d_in_maps[map_key], p_coords_man->d_out_maps[map_key], + p_coords_man->in_maps[map_key], p_coords_man->out_maps[map_key], use_avg, at::cuda::getCurrentCUDAStream()); } return grad_in_feat; @@ -254,22 +254,22 @@ template at::Tensor GlobalPoolingBackwardCPU( py::object py_coords_manager, bool use_avg); #ifndef CPU_ONLY -template vector GlobalPoolingForwardGPU( +template vector GlobalPoolingForwardGPU( at::Tensor in_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg, int pooling_mode); -template vector GlobalPoolingForwardGPU( +template vector GlobalPoolingForwardGPU( at::Tensor in_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg, int pooling_mode); -template at::Tensor GlobalPoolingBackwardGPU( +template at::Tensor GlobalPoolingBackwardGPU( at::Tensor in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg); -template at::Tensor GlobalPoolingBackwardGPU( +template at::Tensor GlobalPoolingBackwardGPU( at::Tensor in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager, bool use_avg); diff --git a/src/pooling_global_max.cpp b/src/pooling_global_max.cpp index dec5d937..09437fe0 100644 --- a/src/pooling_global_max.cpp +++ b/src/pooling_global_max.cpp @@ -86,9 +86,9 @@ void GlobalMaxPoolingForwardGPU(at::Tensor in_feat, at::Tensor out_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); - const auto &in_out = p_coords_manager->getOriginInOutMapsGPU( + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); + const InOutMapKey map_key = p_coords_manager->getOriginInOutMaps( py_in_coords_key, py_out_coords_key); const int out_nrows = p_coords_manager->getCoordsSize(py_out_coords_key); @@ -99,14 +99,16 @@ void GlobalMaxPoolingForwardGPU(at::Tensor in_feat, at::Tensor out_feat, 
num_nonzero.zero_(); // Compute the scratch space - const int nmap = getInOutMapsSize(in_out.first); + const int nmap = getInOutMapsSizeGPU(p_coords_manager->in_maps[map_key]); int *d_scr = (int *)p_coords_manager->getScratchGPUMemory(5 * nmap * sizeof(int)); MaxPoolingForwardKernelGPU( in_feat.template data(), out_feat.template data(), - out_nrows, num_nonzero.template data(), nchannel, get<0>(in_out), - get<1>(in_out), d_scr, at::cuda::getCurrentCUDAStream()); + out_nrows, num_nonzero.template data(), nchannel, + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + d_scr, at::cuda::getCurrentCUDAStream()); p_coords_manager->clearScratchGPUMemory(); } @@ -150,22 +152,22 @@ template void GlobalMaxPoolingBackwardCPU( py::object py_out_coords_key, py::object py_coords_manager); #ifndef CPU_ONLY -template void GlobalMaxPoolingForwardGPU( +template void GlobalMaxPoolingForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor num_nonzero, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void GlobalMaxPoolingForwardGPU( +template void GlobalMaxPoolingForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor num_nonzero, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void GlobalMaxPoolingBackwardGPU( +template void GlobalMaxPoolingBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void GlobalMaxPoolingBackwardGPU( +template void GlobalMaxPoolingBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); diff --git a/src/pooling_max.cpp b/src/pooling_max.cpp index 7ac88719..7d9a9c64 100644 --- a/src/pooling_max.cpp +++ b/src/pooling_max.cpp @@ -96,9 +96,9 @@ void MaxPoolingForwardGPU(at::Tensor in_feat, at::Tensor out_feat, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); - const auto &in_out = p_coords_manager->getInOutMapsGPU( + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); + const InOutMapKey map_key = p_coords_manager->getInOutMaps( tensor_strides, strides, kernel_sizes, dilations, region_type, offsets, py_in_coords_key, py_out_coords_key, false, true); @@ -110,14 +110,16 @@ void MaxPoolingForwardGPU(at::Tensor in_feat, at::Tensor out_feat, num_nonzero.zero_(); // Compute the scratch space - int nmap = getInOutMapsSize(in_out.first); + int nmap = getInOutMapsSizeGPU(p_coords_manager->in_maps[map_key]); int *d_scr = (int *)p_coords_manager->getScratchGPUMemory(5 * nmap * sizeof(int)); MaxPoolingForwardKernelGPU( in_feat.template data(), out_feat.template data(), - out_nrows, num_nonzero.data(), nchannel, in_out.first, in_out.second, + out_nrows, num_nonzero.data(), nchannel, + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], d_scr, at::cuda::getCurrentCUDAStream()); p_coords_manager->clearScratchGPUMemory(); @@ -172,28 +174,28 @@ template void MaxPoolingBackwardCPU( #ifndef CPU_ONLY -template void MaxPoolingForwardGPU( +template void MaxPoolingForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector 
dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void MaxPoolingForwardGPU( +template void MaxPoolingForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void MaxPoolingBackwardGPU( +template void MaxPoolingBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void MaxPoolingBackwardGPU( +template void MaxPoolingBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, diff --git a/src/pooling_max.cu b/src/pooling_max.cu index 7861fe32..61e1424b 100644 --- a/src/pooling_max.cu +++ b/src/pooling_max.cu @@ -102,24 +102,24 @@ namespace minkowski { template void MaxPoolingForwardKernelGPU(const Dtype *d_in_feat, Dtype *d_out_feat, int out_nrows, Itype *d_max_index, int nchannel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, Itype *d_scr, + const vector& in_maps, const vector& out_maps, + Itype *d_scr, cudaStream_t stream) { int nmap = 0; // Copy all maps to one vector for (const auto &map : in_maps) - nmap += map.size(); + nmap += map.size(0); Itype *d_in_map = d_scr, *d_out_map = d_scr + nmap; CUDA_CHECK(cudaMemcpy(d_in_map, - in_maps[0].data(), // in_maps are contiguous of size nnz + in_maps[0].data(), // in_maps are contiguous of size nnz nmap * sizeof(int), cudaMemcpyDeviceToDevice)); CUDA_CHECK( cudaMemcpy(d_out_map, - out_maps[0].data(), // out_maps are contiguous of size nnz + out_maps[0].data(), // out_maps are contiguous of size nnz nmap * sizeof(int), cudaMemcpyDeviceToDevice)); // First, sort d_out_map and d_in_map with the d_out_map so that in_feat are @@ -171,13 +171,15 @@ void MaxPoolingForwardKernelGPU(const Dtype *d_in_feat, Dtype *d_out_feat, template void MaxPoolingForwardKernelGPU( const float *d_in_feat, float *d_out_feat, int out_nrows, - int32_t *d_max_index, int nchannel, const pInOutMaps &in_map, - const pInOutMaps &out_map, int32_t *d_scr, cudaStream_t stream); + int32_t *d_max_index, int nchannel, + const vector& in_maps, const vector& out_maps, + int32_t *d_scr, cudaStream_t stream); template void MaxPoolingForwardKernelGPU( const double *d_in_feat, double *d_out_feat, int out_nrows, - int32_t *d_max_index, int nchannel, const pInOutMaps &in_map, - const pInOutMaps &out_map, int32_t *d_scr, cudaStream_t stream); + int32_t *d_max_index, int nchannel, + const vector& in_maps, const vector& out_maps, + int32_t *d_scr, cudaStream_t stream); template void MaxPoolingBackwardKernelGPU(Dtype *d_grad_in_feat, int in_nrows, diff --git a/src/pooling_max.cuh b/src/pooling_max.cuh index 684e4e8a..62066a69 100644 --- a/src/pooling_max.cuh +++ b/src/pooling_max.cuh @@ -27,6 +27,7 @@ #include #include +#include #include "gpu.cuh" #include "math_functions.hpp" @@ -37,8 +38,8 @@ namespace minkowski { template void MaxPoolingForwardKernelGPU(const Dtype *d_in_feat, Dtype *d_out_feat, int out_nrows, Itype *d_max_index, int nchannel, - const pInOutMaps 
&in_map, - const pInOutMaps &out_map, Itype *d_scr, + const vector& in_maps, const vector& out_maps, + Itype *d_scr, cudaStream_t stream); template diff --git a/src/pooling_transpose.cpp b/src/pooling_transpose.cpp index a32c6aa7..45a0f496 100644 --- a/src/pooling_transpose.cpp +++ b/src/pooling_transpose.cpp @@ -122,9 +122,9 @@ void PoolingTransposeForwardGPU(at::Tensor in_feat, at::Tensor out_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); - const auto &in_out = p_coords_manager->getInOutMapsGPU( + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); + const InOutMapKey map_key = p_coords_manager->getInOutMaps( tensor_strides, strides, kernel_sizes, dilations, region_type, offsets, py_in_coords_key, py_out_coords_key, true, true); @@ -140,8 +140,10 @@ void PoolingTransposeForwardGPU(at::Tensor in_feat, at::Tensor out_feat, NonzeroAvgPoolingForwardKernelGPU( in_feat.template data(), in_feat.size(0), out_feat.template data(), out_nrows, - num_nonzero.template data(), in_feat.size(1), get<0>(in_out), - get<1>(in_out), false, handle, at::cuda::getCurrentCUDAStream()); + num_nonzero.template data(), in_feat.size(1), + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + false, handle, at::cuda::getCurrentCUDAStream()); } template @@ -151,8 +153,8 @@ void PoolingTransposeBackwardGPU( vector kernel_sizes, vector dilations, int region_type, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); bool reverse_map = false; const InOutMapKey rev_map_key = p_coords_manager->getMapHashKey( tensor_strides, strides, kernel_sizes, dilations, region_type, @@ -171,29 +173,29 @@ void PoolingTransposeBackwardGPU( if (!reverse_map) { ASSERT( - p_coords_manager->d_in_maps.find(map_key) != - p_coords_manager->d_in_maps.end(), + p_coords_manager->in_maps.find(map_key) != + p_coords_manager->in_maps.end(), "The in-out map doesn't exist for backward. Did you run forward pass?"); NonzeroAvgPoolingBackwardKernelGPU( grad_in_feat.template data(), in_feat.size(0), grad_out_feat.template data(), grad_out_feat.size(0), num_nonzero.template data(), in_feat.size(1), - p_coords_manager->d_in_maps[map_key], - p_coords_manager->d_out_maps[map_key], false, + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], false, at::cuda::getCurrentCUDAStream()); } else { ASSERT( - p_coords_manager->d_in_maps.find(rev_map_key) != - p_coords_manager->d_in_maps.end(), + p_coords_manager->in_maps.find(rev_map_key) != + p_coords_manager->in_maps.end(), "The in-out map doesn't exist for backward. 
Did you run forward pass?"); NonzeroAvgPoolingBackwardKernelGPU( grad_in_feat.template data(), in_feat.size(0), grad_out_feat.template data(), grad_out_feat.size(0), num_nonzero.template data(), in_feat.size(1), - p_coords_manager->d_out_maps[rev_map_key], - p_coords_manager->d_in_maps[rev_map_key], false, + p_coords_manager->out_maps[rev_map_key], + p_coords_manager->in_maps[rev_map_key], false, at::cuda::getCurrentCUDAStream()); } } @@ -229,28 +231,28 @@ template void PoolingTransposeBackwardCPU( #ifndef CPU_ONLY -template void PoolingTransposeForwardGPU( +template void PoolingTransposeForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void PoolingTransposeForwardGPU( +template void PoolingTransposeForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, at::Tensor offsets, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void PoolingTransposeBackwardGPU( +template void PoolingTransposeBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void PoolingTransposeBackwardGPU( +template void PoolingTransposeBackwardGPU( at::Tensor in_feat, at::Tensor grad_in_feat, at::Tensor grad_out_feat, at::Tensor num_nonzero, vector tensor_strides, vector strides, vector kernel_sizes, vector dilations, int region_type, diff --git a/src/pruning.cpp b/src/pruning.cpp index 5ca31e8a..98f7eeac 100644 --- a/src/pruning.cpp +++ b/src/pruning.cpp @@ -98,9 +98,9 @@ void PruningForwardGPU(at::Tensor in_feat, // GPU feat py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); - const auto &in_out = p_coords_manager->getPruningInOutMapsGPU( + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); + const InOutMapKey map_key = p_coords_manager->getPruningInOutMaps( use_feat, py_in_coords_key, py_out_coords_key); // Get the total number of coords @@ -115,7 +115,9 @@ void PruningForwardGPU(at::Tensor in_feat, // GPU feat PruningForwardKernelGPU( in_feat.template data(), out_feat.template data(), - in_feat.size(1), get<0>(in_out), get<1>(in_out), + in_feat.size(1), + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], at::cuda::getCurrentCUDAStream()); } } @@ -126,14 +128,14 @@ void PruningBackwardGPU(at::Tensor grad_in_feat, // GPU feat py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); const InOutMapKey map_key = p_coords_manager->getOriginMapHashKey( py_in_coords_key, py_out_coords_key); - ASSERT(p_coords_manager->d_in_maps.find(map_key) != - p_coords_manager->d_in_maps.end(), + ASSERT(p_coords_manager->in_maps.find(map_key) != + p_coords_manager->in_maps.end(), "The in-out map doesn't exist for backward. 
Did you run forward pass?") const int in_nrows = p_coords_manager->getCoordsSize(py_in_coords_key); @@ -145,8 +147,8 @@ void PruningBackwardGPU(at::Tensor grad_in_feat, // GPU feat PruningBackwardKernelGPU(grad_in_feat.template data(), grad_out_feat.template data(), nchannel, - p_coords_manager->d_in_maps[map_key], - p_coords_manager->d_out_maps[map_key], + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], at::cuda::getCurrentCUDAStream()); else WARNING(true, "MinkowskiPruning: Backprop from a size-0 sparse tensor."); @@ -174,22 +176,22 @@ template void PruningBackwardCPU( py::object py_coords_manager); #ifndef CPU_ONLY -template void PruningForwardGPU( +template void PruningForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor use_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void PruningForwardGPU( +template void PruningForwardGPU( at::Tensor in_feat, at::Tensor out_feat, at::Tensor use_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void PruningBackwardGPU( +template void PruningBackwardGPU( at::Tensor grad_in_feat, at::Tensor grad_out_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); -template void PruningBackwardGPU( +template void PruningBackwardGPU( at::Tensor grad_in_feat, at::Tensor grad_out_feat, py::object py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); diff --git a/src/pruning.cu b/src/pruning.cu index baff628e..6ec0aeb6 100644 --- a/src/pruning.cu +++ b/src/pruning.cu @@ -46,49 +46,47 @@ __global__ void copy_in_out_map(const int n, const Dtype *in_feat, template void PruningForwardKernelGPU(const Dtype *d_in_feat, Dtype *d_out_feat, const int nchannel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream) { - const int nnz = in_maps[0].size(); + const int nnz = in_maps[0].size(0); copy_in_out_map <<>>( - nnz, d_in_feat, d_out_feat, nchannel, in_maps[0].data(), - out_maps[0].data()); + nnz, d_in_feat, d_out_feat, nchannel, in_maps[0].data(), + out_maps[0].data()); } template void PruningBackwardKernelGPU(Dtype *d_grad_in_feat, const Dtype *d_grad_out_feat, int nchannel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream) { - const int nnz = in_maps[0].size(); + const int nnz = in_maps[0].size(0); copy_in_out_map <<>>( - nnz, d_grad_out_feat, d_grad_in_feat, nchannel, out_maps[0].data(), - in_maps[0].data()); + nnz, d_grad_out_feat, d_grad_in_feat, nchannel, out_maps[0].data(), + in_maps[0].data()); } template void PruningForwardKernelGPU( const float *d_in_feat, float *d_out_feat, int nchannel, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); template void PruningBackwardKernelGPU( float *d_grad_in_feat, const float *d_grad_out_feat, int nchannel, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); template void PruningForwardKernelGPU( const double *d_in_feat, double *d_out_feat, int nchannel, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); template void PruningBackwardKernelGPU( double *d_grad_in_feat, const double *d_grad_out_feat, int nchannel, - const pInOutMaps 
&in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); } // end namespace minkowski diff --git a/src/pruning.cuh b/src/pruning.cuh index 9ac6ed32..3399c288 100644 --- a/src/pruning.cuh +++ b/src/pruning.cuh @@ -27,6 +27,7 @@ #include #include +#include #include "types.hpp" @@ -34,15 +35,14 @@ namespace minkowski { template void PruningForwardKernelGPU(const Dtype *d_in_feat, Dtype *d_out_feat, - int nchannel, const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + int nchannel, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); template void PruningBackwardKernelGPU(Dtype *d_grad_in_feat, const Dtype *d_grad_out_feat, int nchannel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); } // end namespace minkowski diff --git a/src/quantization.cpp b/src/quantization.cpp index a1060bf7..65cae94a 100644 --- a/src/quantization.cpp +++ b/src/quantization.cpp @@ -230,21 +230,6 @@ template InOutMaps CopyToInOutMap(at::Tensor th_map) { return vec_map; } -#ifndef CPU_ONLY -template -pInOutMaps CopyToInOutMapGPU(at::Tensor th_map) { - pInOutMaps vec_map; - - Dtype *d_scr; - CUDA_CHECK(cudaMalloc(&d_scr, th_map.size(0) * sizeof(Dtype))); - CUDA_CHECK(cudaMemcpy(d_scr, th_map.template data(), - th_map.size(0) * sizeof(Dtype), - cudaMemcpyHostToDevice)); - vec_map.push_back(pVector(d_scr, th_map.size(0))); - return vec_map; -} -#endif - /** * A collection of feature averaging methods * mode == 0: non-weighted average @@ -278,22 +263,19 @@ at::Tensor quantization_average_features( if (th_in_map.dtype() == torch::kInt64) { if (th_in_feat.is_cuda()) { #ifndef CPU_ONLY - auto vec_in_map = CopyToInOutMapGPU(th_in_map); - auto vec_out_map = CopyToInOutMapGPU(th_out_map); - if (th_in_feat.dtype() == torch::kFloat32) { NonzeroAvgPoolingForwardKernelGPU( th_in_feat.template data(), th_in_feat.size(0), th_out_feat.template data(), out_nrows, th_num_nonzero.template data(), th_in_feat.size(1), - vec_in_map, vec_out_map, true, handle, + {th_in_map}, {th_out_map}, true, handle, at::cuda::getCurrentCUDAStream()); } else if (th_in_feat.dtype() == torch::kFloat64) { NonzeroAvgPoolingForwardKernelGPU( th_in_feat.template data(), th_in_feat.size(0), th_out_feat.template data(), out_nrows, th_num_nonzero.template data(), th_in_feat.size(1), - vec_in_map, vec_out_map, true, handle, + {th_in_map}, {th_out_map}, true, handle, at::cuda::getCurrentCUDAStream()); } else { throw std::runtime_error("Dtype not supported."); @@ -324,22 +306,19 @@ at::Tensor quantization_average_features( } else if (th_in_map.dtype() == torch::kInt32) { if (th_in_feat.is_cuda()) { #ifndef CPU_ONLY - auto vec_in_map = CopyToInOutMapGPU(th_in_map); - auto vec_out_map = CopyToInOutMapGPU(th_out_map); - if (th_in_feat.dtype() == torch::kFloat32) { NonzeroAvgPoolingForwardKernelGPU( th_in_feat.template data(), th_in_feat.size(0), th_out_feat.template data(), out_nrows, th_num_nonzero.template data(), th_in_feat.size(1), - vec_in_map, vec_out_map, true, handle, + {th_in_map}, {th_out_map}, true, handle, at::cuda::getCurrentCUDAStream()); } else if (th_in_feat.dtype() == torch::kFloat64) { NonzeroAvgPoolingForwardKernelGPU( th_in_feat.template data(), th_in_feat.size(0), th_out_feat.template data(), out_nrows, th_num_nonzero.template data(), th_in_feat.size(1), - vec_in_map, vec_out_map, true, handle, + {th_in_map}, {th_out_map}, true, handle, at::cuda::getCurrentCUDAStream()); } else { throw 
std::runtime_error("Dtype not supported."); diff --git a/src/region.hpp b/src/region.hpp index 921f92ce..27912055 100644 --- a/src/region.hpp +++ b/src/region.hpp @@ -47,6 +47,7 @@ class RegionIterator { vector operator*() { return point; }; }; +// TODO(ljm): remove stride, stride will not affect region class Region { public: Region(const Region ®ion_); diff --git a/src/union.cpp b/src/union.cpp index 5dd933f2..d4336df9 100644 --- a/src/union.cpp +++ b/src/union.cpp @@ -119,8 +119,8 @@ at::Tensor UnionForwardGPU(vector in_feats, vector py_in_coords_keys, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); // Basic assertions ASSERT(in_feats.size() > 1, "The number of input tensors must be > 1."); const size_t n_in = in_feats.size(); @@ -139,7 +139,7 @@ at::Tensor UnionForwardGPU(vector in_feats, } // Create new out map and get the in-out map - const auto in_out = p_coords_manager->getUnionInOutMapsGPU(py_in_coords_keys, + const InOutMapKey map_key = p_coords_manager->getUnionInOutMaps(py_in_coords_keys, py_out_coords_key); // Out feat memory alloc @@ -155,7 +155,9 @@ at::Tensor UnionForwardGPU(vector in_feats, UnionForwardKernelGPU( p_in_feats, out_feat.template data(), in_feats[0].size(1), - in_out.first, in_out.second, at::cuda::getCurrentCUDAStream()); + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], + at::cuda::getCurrentCUDAStream()); return out_feat; } @@ -164,8 +166,8 @@ template vector UnionBackwardGPU(at::Tensor grad_out_feat, vector py_in_coords_keys, py::object py_out_coords_key, py::object py_coords_manager) { - CoordsManager *p_coords_manager = - py_coords_manager.cast *>(); + GPUCoordsManager *p_coords_manager = + py_coords_manager.cast *>(); const int nchannel = grad_out_feat.size(1); const size_t n_in = py_in_coords_keys.size(); @@ -192,8 +194,8 @@ UnionBackwardGPU(at::Tensor grad_out_feat, vector py_in_coords_keys, UnionBackwardKernelGPU( p_grad_in_feats, grad_out_feat.template data(), nchannel, - p_coords_manager->d_in_maps[map_key], - p_coords_manager->d_out_maps[map_key], at::cuda::getCurrentCUDAStream()); + p_coords_manager->in_maps[map_key], + p_coords_manager->out_maps[map_key], at::cuda::getCurrentCUDAStream()); return grad_in_feats; } @@ -216,19 +218,19 @@ template vector UnionBackwardCPU( py::object py_out_coords_key, py::object py_coords_manager); #ifndef CPU_ONLY -template at::Tensor UnionForwardGPU( +template at::Tensor UnionForwardGPU( vector in_feats, vector py_in_coords_keys, py::object py_out_coords_key, py::object py_coords_manager); -template at::Tensor UnionForwardGPU( +template at::Tensor UnionForwardGPU( vector in_feats, vector py_in_coords_keys, py::object py_out_coords_key, py::object py_coords_manager); -template vector UnionBackwardGPU( +template vector UnionBackwardGPU( at::Tensor grad_out_feat, vector py_in_coords_keys, py::object py_out_coords_key, py::object py_coords_manager); -template vector UnionBackwardGPU( +template vector UnionBackwardGPU( at::Tensor grad_out_feat, vector py_in_coords_key, py::object py_out_coords_key, py::object py_coords_manager); diff --git a/src/union.cu b/src/union.cu index c23f672a..ebd3058a 100644 --- a/src/union.cu +++ b/src/union.cu @@ -23,7 +23,7 @@ * of the code. 
*/ #include "gpu.cuh" -#include "pruning.cuh" +#include "union.cuh" namespace minkowski { @@ -62,53 +62,53 @@ __global__ void copy_in_out_map(const int n, const Dtype *in_feat, template void UnionForwardKernelGPU(const vector d_in_feats, Dtype *d_out_feat, - const int nchannel, const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const int nchannel, + const vector& in_maps, const vector& out_maps, cudaStream_t stream) { for (size_t k = 0; k < in_maps.size(); k++) { - const size_t nnz = in_maps[k].size(); + const size_t nnz = in_maps[k].size(0); add_in_out_map <<>>( - nnz, d_in_feats[k], d_out_feat, nchannel, in_maps[k].data(), - out_maps[k].data()); + nnz, d_in_feats[k], d_out_feat, nchannel, in_maps[k].data(), + out_maps[k].data()); } } template void UnionBackwardKernelGPU(vector d_grad_in_feats, const Dtype *d_grad_out_feat, int nchannel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream) { for (size_t k = 0; k < in_maps.size(); k++) { - const int nnz = in_maps[k].size(); + const int nnz = in_maps[k].size(0); copy_in_out_map <<>>( nnz, d_grad_out_feat, d_grad_in_feats[k], nchannel, - out_maps[k].data(), in_maps[k].data()); + out_maps[k].data(), in_maps[k].data()); } } template void UnionForwardKernelGPU( const vector d_in_feats, float *d_out_feat, int nchannel, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); template void UnionBackwardKernelGPU( vector d_grad_in_feats, const float *d_grad_out_feat, int nchannel, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); template void UnionForwardKernelGPU( const vector d_in_feats, double *d_out_feat, int nchannel, - const pInOutMaps &in_maps, const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); template void UnionBackwardKernelGPU( vector d_grad_in_feats, const double *d_grad_out_feat, - int nchannel, const pInOutMaps &in_maps, - const pInOutMaps &out_maps, cudaStream_t stream); + int nchannel, + const vector& in_maps, const vector& out_maps, + cudaStream_t stream); } // end namespace minkowski diff --git a/src/union.cuh b/src/union.cuh index af4719f8..ceaeb98e 100644 --- a/src/union.cuh +++ b/src/union.cuh @@ -27,6 +27,7 @@ #include #include +#include #include "types.hpp" @@ -34,15 +35,14 @@ namespace minkowski { template void UnionForwardKernelGPU(const vector d_in_feats, Dtype *d_out_feat, - int nchannel, const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + int nchannel, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); template void UnionBackwardKernelGPU(vector d_grad_in_feats, const Dtype *d_grad_out_feat, int nchannel, - const pInOutMaps &in_maps, - const pInOutMaps &out_maps, + const vector& in_maps, const vector& out_maps, cudaStream_t stream); } // namespace minkowski
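
With the pooling, pruning and union kernels now taking const std::vector<at::Tensor>& for both maps, call sites can build those vectors in place; this is what the {th_in_map}, {th_out_map} arguments in the quantization.cpp hunk rely on, and why the CopyToInOutMapGPU helper could be deleted. Below is a minimal, self-contained sketch of that calling convention; total_map_entries is an illustrative stand-in with the same parameter style as the rewritten kernels, not a function from the patch.

#include <torch/torch.h>
#include <vector>

// Stand-in mirroring the rewritten GPU kernel parameter style:
// parallel vectors of contiguous int32 CUDA tensors.
static int64_t total_map_entries(const std::vector<at::Tensor> &in_maps,
                                 const std::vector<at::Tensor> &out_maps) {
  TORCH_CHECK(in_maps.size() == out_maps.size(),
              "in/out maps must have the same number of segments");
  int64_t nmaps = 0;
  for (const auto &m : in_maps)
    nmaps += m.size(0);  // same accumulation the kernels perform
  return nmaps;
}

int main() {
  auto opts = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA);
  at::Tensor th_in_map = torch::arange(0, 8, opts);
  at::Tensor th_out_map = torch::zeros({8}, opts);

  // Brace-initialization constructs the std::vector<at::Tensor> arguments
  // in place, mirroring the {th_in_map}, {th_out_map} call sites above.
  return total_map_entries({th_in_map}, {th_out_map}) == 8 ? 0 : 1;
}
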