diff --git a/CMakeLists.txt b/CMakeLists.txt index a34a67f2..e3b8254f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,7 +200,6 @@ if(HNSWLIB_EXAMPLES) if(ENABLE_ASAN OR ENABLE_UBSAN) add_cxx_flags(-DHNSWLIB_USE_PREFETCH=0) endif() - add_cxx_flags(-Wall -Wextra -Wpedantic -Werror) # Unused functions in header files might still be used by other code diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index a04b2ed4..7663d731 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -19,12 +19,16 @@ typedef unsigned int tableint; constexpr tableint kInvalidInternalId = std::numeric_limits::max(); typedef unsigned int linklistsizeint; +static const size_t kCacheLineSize = 64; + template class HierarchicalNSW : public AlgorithmInterface { public: static const tableint MAX_LABEL_OPERATION_LOCKS = 65536; static const unsigned char DELETE_MARK = 0x01; + static const size_t kDefaultMaxElementsPerChunk = 10 * 1024; + size_t max_elements_{0}; mutable std::atomic cur_element_count{0}; // current number of elements size_t size_data_per_element_{0}; @@ -52,8 +56,8 @@ class HierarchicalNSW : public AlgorithmInterface { size_t size_links_level0_{0}; size_t offsetData_{0}, offsetLevel0_{0}, label_offset_{ 0 }; - char *data_level0_memory_{nullptr}; - char **linkLists_{nullptr}; + ChunkedArray data_level0_memory_; + ChunkedArray linkLists_; std::vector element_levels_; // keeps level of each element size_t data_size_{0}; @@ -75,6 +79,7 @@ class HierarchicalNSW : public AlgorithmInterface { std::mutex deleted_elements_lock; // lock for deleted_elements std::unordered_set deleted_elements; // contains internal ids of deleted elements + size_t num_elements_per_chunk_{kDefaultMaxElementsPerChunk}; HierarchicalNSW(SpaceInterface *s) { } @@ -85,8 +90,10 @@ class HierarchicalNSW : public AlgorithmInterface { const std::string &location, bool nmslib = false, size_t max_elements = 0, - bool allow_replace_deleted = false) - : allow_replace_deleted_(allow_replace_deleted) { + bool 
allow_replace_deleted = false, + size_t num_elements_per_chunk = kDefaultMaxElementsPerChunk) + : allow_replace_deleted_(allow_replace_deleted), + num_elements_per_chunk_(num_elements_per_chunk) { loadIndex(location, s, max_elements); } @@ -97,11 +104,13 @@ class HierarchicalNSW : public AlgorithmInterface { size_t M = 16, size_t ef_construction = 200, size_t random_seed = 100, - bool allow_replace_deleted = false) + bool allow_replace_deleted = false, + size_t num_elements_per_chunk = kDefaultMaxElementsPerChunk) : label_op_locks_(MAX_LABEL_OPERATION_LOCKS), link_list_locks_(max_elements), element_levels_(max_elements), - allow_replace_deleted_(allow_replace_deleted) { + allow_replace_deleted_(allow_replace_deleted), + num_elements_per_chunk_(num_elements_per_chunk) { max_elements_ = max_elements; num_deleted_ = 0; data_size_ = s->get_data_size(); @@ -128,10 +137,11 @@ class HierarchicalNSW : public AlgorithmInterface { label_offset_ = size_links_level0_ + data_size_; offsetLevel0_ = 0; - data_level0_memory_ = (char *) malloc(max_elements_ * size_data_per_element_); - if (data_level0_memory_ == nullptr) { - HNSWLIB_THROW_RUNTIME_ERROR("Not enough memory to allocate for level 0"); - } + // Allocate 64 more bytes for each chunk so we can safely prefetch a + // cache line beyond the chunk. 
+ data_level0_memory_ = ChunkedArray( + size_data_per_element_, num_elements_per_chunk_, max_elements, + kCacheLineSize); cur_element_count = 0; @@ -141,11 +151,10 @@ class HierarchicalNSW : public AlgorithmInterface { enterpoint_node_ = -1; maxlevel_ = -1; - linkLists_ = (char **) malloc(sizeof(void *) * max_elements_); - if (linkLists_ == nullptr) { - HNSWLIB_THROW_RUNTIME_ERROR( - "Not enough memory: HierarchicalNSW failed to allocate linklists"); - } + linkLists_ = ChunkedArray( + /* element_byte_size= */ sizeof(void *), + num_elements_per_chunk_, max_elements, + /* chunk_padding_bytes= */ 0); size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); mult_ = 1 / log(1.0 * M_); @@ -157,17 +166,22 @@ class HierarchicalNSW : public AlgorithmInterface { clear(); } + void setLinkListPtr(tableint internal_id, char* data) { + *reinterpret_cast(linkLists_[internal_id]) = data; + } + + char* getLinkListPtr(tableint internal_id) const { + return *reinterpret_cast(linkLists_[internal_id]); + } + void clear() { - free(data_level0_memory_); - data_level0_memory_ = nullptr; + data_level0_memory_.clear(); for (tableint i = 0; i < cur_element_count; i++) { - if (element_levels_[i] > 0) - free(linkLists_[i]); - } - if (linkLists_) { - free(linkLists_); + if (element_levels_[i] > 0) { + free(getLinkListPtr(i)); + } } - linkLists_ = nullptr; + linkLists_.clear(); cur_element_count = 0; visited_list_pool_.reset(nullptr); } @@ -195,7 +209,7 @@ class HierarchicalNSW : public AlgorithmInterface { inline labeltype getExternalLabel(tableint internal_id) const { labeltype return_label; - memcpy(&return_label, (data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_), sizeof(labeltype)); + memcpy(&return_label, data_level0_memory_[internal_id] + label_offset_, sizeof(labeltype)); return return_label; } @@ -212,20 +226,19 @@ class HierarchicalNSW : public AlgorithmInterface { inline void setExternalLabel(tableint internal_id, labeltype label) const { 
- memcpy((data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_), &label, sizeof(labeltype)); + memcpy(data_level0_memory_[internal_id] + label_offset_, &label, sizeof(labeltype)); } inline labeltype *getExternalLabeLp(tableint internal_id) const { - return (labeltype *) (data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_); + return (labeltype *) (data_level0_memory_[internal_id] + label_offset_); } inline char *getDataByInternalId(tableint internal_id) const { - return (data_level0_memory_ + internal_id * size_data_per_element_ + offsetData_); + return (data_level0_memory_[internal_id] + offsetData_); } - int getRandomLevel(double reverse_size) { std::uniform_real_distribution distribution(0.0, 1.0); double r = -log(distribution(level_generator_)) * reverse_size; @@ -285,24 +298,18 @@ class HierarchicalNSW : public AlgorithmInterface { } size_t size = getListCount((linklistsizeint*)data); tableint *datal = (tableint *) (data + 1); -#ifdef USE_SSE -#if HNSWLIB_USE_PREFETCH - _mm_prefetch((char *) (visited_array + *(data + 1)), _MM_HINT_T0); - _mm_prefetch((char *) (visited_array + *(data + 1) + 64), _MM_HINT_T0); - _mm_prefetch(getDataByInternalId(*datal), _MM_HINT_T0); - _mm_prefetch(getDataByInternalId(*(datal + 1)), _MM_HINT_T0); -#endif -#endif + HNSWLIB_MM_PREFETCH((char *) (visited_array + *(data + 1)), _MM_HINT_T0); + HNSWLIB_MM_PREFETCH((char *) (visited_array + *(data + 1) + kCacheLineSize), _MM_HINT_T0); + HNSWLIB_MM_PREFETCH(getDataByInternalId(*datal), _MM_HINT_T0); + HNSWLIB_MM_PREFETCH(getDataByInternalId(*(datal + 1)), _MM_HINT_T0); for (size_t j = 0; j < size; j++) { tableint candidate_id = *(datal + j); // if (candidate_id == 0) continue; -#ifdef USE_SSE -#if HNSWLIB_USE_PREFETCH - _mm_prefetch((char *) (visited_array + *(datal + j + 1)), _MM_HINT_T0); - _mm_prefetch(getDataByInternalId(*(datal + j + 1)), _MM_HINT_T0); -#endif -#endif + if (j + 1 < size) { + HNSWLIB_MM_PREFETCH((char *) (visited_array + 
*(datal + j + 1)), _MM_HINT_T0); + HNSWLIB_MM_PREFETCH(getDataByInternalId(*(datal + j + 1)), _MM_HINT_T0); + } if (visited_array[candidate_id] == visited_array_tag) continue; visited_array[candidate_id] = visited_array_tag; char *currObj1 = (getDataByInternalId(candidate_id)); @@ -310,11 +317,7 @@ class HierarchicalNSW : public AlgorithmInterface { dist_t dist1 = fstdistfunc_(data_point, currObj1, dist_func_param_); if (top_candidates.size() < ef_construction_ || lowerBound > dist1) { candidateSet.emplace(-dist1, candidate_id); -#ifdef USE_SSE -#if HNSWLIB_USE_PREFETCH - _mm_prefetch(getDataByInternalId(candidateSet.top().second), _MM_HINT_T0); -#endif -#endif + HNSWLIB_MM_PREFETCH(getDataByInternalId(candidateSet.top().second), _MM_HINT_T0); if (!isMarkedDeleted(candidate_id)) top_candidates.emplace(dist1, candidate_id); @@ -395,25 +398,18 @@ class HierarchicalNSW : public AlgorithmInterface { metric_distance_computations+=size; } -#ifdef USE_SSE -#if HNSWLIB_USE_PREFETCH - _mm_prefetch((char *) (visited_array + *(data + 1)), _MM_HINT_T0); - _mm_prefetch((char *) (visited_array + *(data + 1) + 64), _MM_HINT_T0); - _mm_prefetch(data_level0_memory_ + (*(data + 1)) * size_data_per_element_ + offsetData_, _MM_HINT_T0); - _mm_prefetch((char *) (data + 2), _MM_HINT_T0); -#endif -#endif + HNSWLIB_MM_PREFETCH((char *) (visited_array + *(data + 1)), _MM_HINT_T0); + HNSWLIB_MM_PREFETCH((char *) (visited_array + *(data + 1) + kCacheLineSize), _MM_HINT_T0); + HNSWLIB_MM_PREFETCH(data_level0_memory_[*(data + 1)] + offsetData_, _MM_HINT_T0); + HNSWLIB_MM_PREFETCH((char *) (data + 2), _MM_HINT_T0); for (size_t j = 1; j <= size; j++) { int candidate_id = *(data + j); -// if (candidate_id == 0) continue; -#ifdef USE_SSE -#if HNSWLIB_USE_PREFETCH - _mm_prefetch((char *) (visited_array + *(data + j + 1)), _MM_HINT_T0); - _mm_prefetch(data_level0_memory_ + (*(data + j + 1)) * size_data_per_element_ + offsetData_, - _MM_HINT_T0); //////////// -#endif -#endif + if (j < size) { + 
HNSWLIB_MM_PREFETCH((char *) (visited_array + *(data + j + 1)), _MM_HINT_T0); + HNSWLIB_MM_PREFETCH(data_level0_memory_[*(data + j + 1)] + offsetData_, + _MM_HINT_T0); + } if (!(visited_array[candidate_id] == visited_array_tag)) { visited_array[candidate_id] = visited_array_tag; @@ -429,13 +425,9 @@ class HierarchicalNSW : public AlgorithmInterface { if (flag_consider_candidate) { candidate_set.emplace(-dist, candidate_id); -#ifdef USE_SSE -#if HNSWLIB_USE_PREFETCH - _mm_prefetch(data_level0_memory_ + candidate_set.top().second * size_data_per_element_ + + HNSWLIB_MM_PREFETCH(data_level0_memory_[candidate_set.top().second] + offsetLevel0_, /////////// _MM_HINT_T0); //////////////////////// -#endif -#endif if (bare_bone_search || (!isMarkedDeleted(candidate_id) && ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(candidate_id))))) { @@ -518,17 +510,18 @@ class HierarchicalNSW : public AlgorithmInterface { linklistsizeint *get_linklist0(tableint internal_id) const { - return (linklistsizeint *) (data_level0_memory_ + internal_id * size_data_per_element_ + offsetLevel0_); + return (linklistsizeint *) (data_level0_memory_[internal_id] + offsetLevel0_); } linklistsizeint *get_linklist0(tableint internal_id, char *data_level0_memory_) const { - return (linklistsizeint *) (data_level0_memory_ + internal_id * size_data_per_element_ + offsetLevel0_); + return (linklistsizeint *) (data_level0_memory_[internal_id] + offsetLevel0_); } linklistsizeint *get_linklist(tableint internal_id, int level) const { - return (linklistsizeint *) (linkLists_[internal_id] + (level - 1) * size_links_per_element_); + assert(level > 0); + return (linklistsizeint *) (getLinkListPtr(internal_id) + (level - 1) * size_links_per_element_); } @@ -681,16 +674,10 @@ class HierarchicalNSW : public AlgorithmInterface { std::vector(new_max_elements).swap(link_list_locks_); // Reallocate base layer - char * data_level0_memory_new = (char *) realloc(data_level0_memory_, new_max_elements * 
size_data_per_element_); - if (data_level0_memory_new == nullptr) - return Status("Not enough memory: resizeIndex failed to allocate base layer"); - data_level0_memory_ = data_level0_memory_new; + data_level0_memory_.resize(new_max_elements); // Reallocate all other layers - char ** linkLists_new = (char **) realloc(linkLists_, sizeof(void *) * new_max_elements); - if (linkLists_new == nullptr) - return Status("Not enough memory: resizeIndex failed to allocate other layers"); - linkLists_ = linkLists_new; + linkLists_.resize(new_max_elements); max_elements_ = new_max_elements; return OkStatus(); @@ -742,13 +729,13 @@ class HierarchicalNSW : public AlgorithmInterface { writeBinaryPOD(output, mult_); writeBinaryPOD(output, ef_construction_); - output.write(data_level0_memory_, cur_element_count * size_data_per_element_); + data_level0_memory_.writeToStream(output, cur_element_count); for (size_t i = 0; i < cur_element_count; i++) { unsigned int linkListSize = element_levels_[i] > 0 ? size_links_per_element_ * element_levels_[i] : 0; writeBinaryPOD(output, linkListSize); if (linkListSize) - output.write(linkLists_[i], linkListSize); + output.write(getLinkListPtr(i), linkListSize); } output.close(); return OkStatus(); @@ -823,10 +810,12 @@ class HierarchicalNSW : public AlgorithmInterface { input.seekg(pos, input.beg); - data_level0_memory_ = (char *) malloc(max_elements * size_data_per_element_); - if (data_level0_memory_ == nullptr) - return Status("Not enough memory: loadIndex failed to allocate level0"); - input.read(data_level0_memory_, cur_element_count * size_data_per_element_); + data_level0_memory_ = ChunkedArray( + size_data_per_element_, + num_elements_per_chunk_, + max_elements, + kCacheLineSize); + data_level0_memory_.readFromStream(input, cur_element_count); size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); @@ -836,9 +825,9 @@ class HierarchicalNSW : public AlgorithmInterface { visited_list_pool_.reset(new VisitedListPool(1, 
max_elements)); - linkLists_ = (char **) malloc(sizeof(void *) * max_elements); - if (linkLists_ == nullptr) - return Status("Not enough memory: loadIndex failed to allocate linklists"); + linkLists_ = ChunkedArray( + sizeof(void *), num_elements_per_chunk_, max_elements, 0); + element_levels_ = std::vector(max_elements); revSize_ = 1.0 / mult_; ef_ = 10; @@ -848,13 +837,13 @@ class HierarchicalNSW : public AlgorithmInterface { readBinaryPOD(input, linkListSize); if (linkListSize == 0) { element_levels_[i] = 0; - linkLists_[i] = nullptr; + setLinkListPtr(i, nullptr); } else { element_levels_[i] = linkListSize / size_links_per_element_; - linkLists_[i] = (char *) malloc(linkListSize); - if (linkLists_[i] == nullptr) + setLinkListPtr(i, (char *) malloc(linkListSize)); + if (getLinkListPtr(i) == nullptr) return Status("Not enough memory: loadIndex failed to allocate linklist"); - input.read(linkLists_[i], linkListSize); + input.read(getLinkListPtr(i), linkListSize); } } @@ -1131,17 +1120,11 @@ class HierarchicalNSW : public AlgorithmInterface { data = get_linklist_at_level(currObj, level); int size = getListCount(data); tableint *datal = (tableint *) (data + 1); -#ifdef USE_SSE -#if HNSWLIB_USE_PREFETCH - _mm_prefetch(getDataByInternalId(*datal), _MM_HINT_T0); -#endif -#endif + HNSWLIB_MM_PREFETCH(getDataByInternalId(*datal), _MM_HINT_T0); for (int i = 0; i < size; i++) { -#ifdef USE_SSE -#if HNSWLIB_USE_PREFETCH - _mm_prefetch(getDataByInternalId(*(datal + i + 1)), _MM_HINT_T0); -#endif -#endif + if (i + 1 < size) { + HNSWLIB_MM_PREFETCH(getDataByInternalId(*(datal + i + 1)), _MM_HINT_T0); + } tableint cand = datal[i]; dist_t d = fstdistfunc_(dataPoint, getDataByInternalId(cand), dist_func_param_); if (d < curdist) { @@ -1262,18 +1245,19 @@ class HierarchicalNSW : public AlgorithmInterface { tableint currObj = enterpoint_node_; tableint enterpoint_copy = enterpoint_node_; - memset(data_level0_memory_ + cur_c * size_data_per_element_ + offsetLevel0_, 0, 
size_data_per_element_);
+        memset(data_level0_memory_[cur_c] + offsetLevel0_, 0, size_data_per_element_);
 
         // Initialisation of the data and label
         memcpy(getExternalLabeLp(cur_c), &label, sizeof(labeltype));
         memcpy(getDataByInternalId(cur_c), data_point, data_size_);
 
         if (curlevel) {
-            linkLists_[cur_c] = (char *) malloc(size_links_per_element_ * curlevel + 1);
-            if (linkLists_[cur_c] == nullptr) {
+            size_t link_list_num_bytes = size_links_per_element_ * curlevel + 1;
+            setLinkListPtr(cur_c, (char *) malloc(link_list_num_bytes));
+            if (getLinkListPtr(cur_c) == nullptr) {
                 return Status("Not enough memory: addPoint failed to allocate linklist");
             }
-            memset(linkLists_[cur_c], 0, size_links_per_element_ * curlevel + 1);
+            memset(getLinkListPtr(cur_c), 0, link_list_num_bytes);
         }
 
         if ((signed)currObj != -1) {
diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h
index 570e876b..e06908f9 100644
--- a/hnswlib/hnswlib.h
+++ b/hnswlib/hnswlib.h
@@ -26,6 +26,11 @@
 #endif
 #endif
 
+#include <cstdlib>
+
+#include <deque>
+#include <memory>
+#include <type_traits>
+
 #if defined(USE_AVX) || defined(USE_SSE)
 #ifdef _MSC_VER
 #include <intrin.h>
@@ -356,8 +361,220 @@ class AlgorithmInterface {
     }
 };
 
+namespace internal {
+
+struct FreeDeleter {
+    void operator()(void* ptr) const {
+        std::free(ptr);
+    }
+};
+
+using MallocUniqueCharArrayPtr = std::unique_ptr<char[], FreeDeleter>;
+
+// Allocates the given number of bytes as a special kind of a unique pointer.
+// Does not initialize the memory.
+// `inline` is required: this is a function definition in a header, and
+// without it every translation unit that includes hnswlib.h would emit its
+// own external definition (ODR violation / duplicate-symbol link error).
+inline MallocUniqueCharArrayPtr makeUniqueCharArray(size_t n_bytes) {
+    char* raw_ptr = static_cast<char*>(malloc(n_bytes));
+    return MallocUniqueCharArrayPtr(raw_ptr);
+}
+
+}  // namespace internal
+
+// Manages a large, array-like data structure by allocating memory in smaller,
+// fixed-size blocks called "chunks." This class provides a flat, array-like
+// view over a large collection of elements without needing a single, massive
+// contiguous memory allocation.
+//
+// It provides random access via `operator[]`, which internally maps an index
+// to the correct chunk and the element's offset within it. The size of the
+// elements and the number of elements per chunk are configured at construction.
+//
+// The class is non-copyable to prevent expensive deep copies but is movable for
+// efficient transfers of ownership. The template parameter `ElementPointerType`
+// specifies the pointer type used to access elements, e.g. `char*` if pointer
+// arithmetics are required, or `void*` if the result would be immediately cast
+// into another pointer type.
+template <typename ElementPointerType>
+class ChunkedArray {
+ public:
+    static_assert(std::is_pointer<ElementPointerType>::value,
+                  "Template parameter ElementPointerType must be a pointer.");
+    ChunkedArray()
+        : element_byte_size_(0),
+          elements_per_chunk_(0),
+          element_count_(0),
+          chunk_padding_bytes_(0) {
+    }
+
+    ChunkedArray(size_t element_byte_size,
+                 size_t elements_per_chunk,
+                 size_t element_count,
+                 size_t chunk_padding_bytes) :
+        element_byte_size_(element_byte_size),
+        elements_per_chunk_(elements_per_chunk),
+        element_count_(0),
+        chunk_padding_bytes_(chunk_padding_bytes) {
+        resize(element_count);
+    }
+
+    ChunkedArray(const ChunkedArray& other) = delete;
+    ChunkedArray& operator=(const ChunkedArray& other) = delete;
+
+    // Delegate to the default constructor before swapping: the scalar members
+    // have no default member initializers, so swapping from a raw
+    // (uninitialized) state would read indeterminate values and leave the
+    // moved-from object holding garbage sizes.
+    ChunkedArray(ChunkedArray&& other) noexcept : ChunkedArray() {
+        swap(other);
+    }
+
+    ChunkedArray& operator=(ChunkedArray&& other) noexcept {
+        if (this != &other) {
+            swap(other);
+        }
+        return *this;
+    }
+
+    void swap(ChunkedArray& other) noexcept {
+        std::swap(element_byte_size_, other.element_byte_size_);
+        std::swap(elements_per_chunk_, other.elements_per_chunk_);
+        std::swap(element_count_, other.element_count_);
+        std::swap(chunks_, other.chunks_);
+        std::swap(chunk_padding_bytes_, other.chunk_padding_bytes_);
+    }
+
+    ~ChunkedArray() {
+    }
+
+    size_t getCapacity() const {
+        return element_count_;
+    }
+
+    size_t getSizePerElement() const {
+        return element_byte_size_;
+    }
+
+    size_t getSizePerChunk()
const {
+        return elements_per_chunk_ * element_byte_size_;
+    }
+
+    ElementPointerType operator[](size_t i) const {
+#ifndef NDEBUG
+        if (i >= getCapacity()) {
+            HNSWERR << "Chunked array index out of range: i=" << i
+                    << ", capacity=" << getCapacity() << std::endl;
+        }
+        assert(i < getCapacity());
+#endif
+        if (i >= getCapacity()) return nullptr;
+        size_t chunk_index = i / elements_per_chunk_;
+        size_t index_in_chunk = i % elements_per_chunk_;
+        return reinterpret_cast<ElementPointerType>(
+            chunks_[chunk_index].get() + element_byte_size_ * index_in_chunk
+        );
+    }
+
+    void clear() {
+        chunks_.clear();
+        element_count_ = 0;
+    }
+
+    void resize(size_t new_element_count) {
+        size_t chunk_count = getChunkCount(element_count_);
+        size_t new_chunk_count = getChunkCount(new_element_count);
+
+        chunks_.resize(new_chunk_count);
+        for (size_t i = chunk_count; i < new_chunk_count; i++) {
+            chunks_[i] = ::hnswlib::internal::makeUniqueCharArray(
+                getSizePerChunk() + chunk_padding_bytes_);
+        }
+
+        element_count_ = new_element_count;
+    }
+
+    void writeToStream(std::ostream& output, size_t num_elements_to_write) {
+        assert(num_elements_to_write <= element_count_);
+        size_t num_chunks_to_write = getChunkCount(num_elements_to_write);
+        size_t last_chunk_bytes = getLastChunkBytes(num_elements_to_write);
+        for (size_t i = 0; i < num_chunks_to_write; ++i) {
+            output.write(
+                chunks_[i].get(),
+                i + 1 == num_chunks_to_write ? last_chunk_bytes
+                                             : getSizePerChunk());
+        }
+    }
+
+    void readFromStream(std::istream& input, size_t num_elements_to_read) {
+        assert(num_elements_to_read <= element_count_);
+        size_t num_chunks_to_read = getChunkCount(num_elements_to_read);
+        size_t last_chunk_bytes = getLastChunkBytes(num_elements_to_read);
+        for (size_t i = 0; i < num_chunks_to_read; ++i) {
+            input.read(
+                chunks_[i].get(),
+                i + 1 == num_chunks_to_read ? last_chunk_bytes
+                                            : getSizePerChunk());
+        }
+    }
+
+    void copyTo(char* destination, size_t num_bytes) {
+        size_t chunk_index = 0;
+        size_t bytes_per_chunk = getSizePerChunk();
+        while (num_bytes > 0) {
+            size_t cur_size = std::min(bytes_per_chunk, num_bytes);
+            memcpy(destination, chunks_[chunk_index].get(), cur_size);
+            num_bytes -= cur_size;
+            destination += cur_size;
+            // Advance to the next chunk. Without this the loop re-copies
+            // chunk 0 forever once num_bytes exceeds a single chunk.
+            ++chunk_index;
+        }
+    }
+
+    void copyFrom(const char* source, size_t num_bytes) {
+        size_t chunk_index = 0;
+        size_t bytes_per_chunk = getSizePerChunk();
+        while (num_bytes > 0) {
+            size_t cur_size = std::min(bytes_per_chunk, num_bytes);
+            memcpy(chunks_[chunk_index].get(), source, cur_size);
+            num_bytes -= cur_size;
+            source += cur_size;
+            // Same fix as in copyTo: move to the next chunk each iteration.
+            ++chunk_index;
+        }
+    }
+
+ private:
+    size_t getChunkCount(size_t element_count) const {
+        return (element_count + elements_per_chunk_ - 1) / elements_per_chunk_;
+    }
+
+    // Returns the byte size of the last chunk if we pretend the element count
+    // is the given number.
+    size_t getLastChunkBytes(size_t element_count) const {
+        size_t last_chunk_num_elements = element_count % elements_per_chunk_;
+        if (last_chunk_num_elements == 0) {
+            // Last chunk is whole.
+            last_chunk_num_elements = elements_per_chunk_;
+        }
+        return last_chunk_num_elements * element_byte_size_;
+    }
+
+    size_t element_byte_size_;
+    size_t elements_per_chunk_;
+    size_t element_count_;
+    std::deque<internal::MallocUniqueCharArrayPtr> chunks_;
+    size_t chunk_padding_bytes_;
+};
+
 }  // namespace hnswlib
 
+#if defined(USE_SSE) && HNSWLIB_USE_PREFETCH
+#if HNSWLIB_DEBUG_PREFETCH
+// This mode is used to find prefetch statements causing range check errors in
+// tests. We only print line numbers, which makes the output compact enough to
+// catch range check errors in some tests.
+#define HNSWLIB_MM_PREFETCH(address, hint) do { \ + std::cout << __LINE__ << " "; \ + _mm_prefetch(address, hint); \ +} while (0) +#else +#define HNSWLIB_MM_PREFETCH(address, hint) _mm_prefetch(address, hint) +#endif +#else +#define HNSWLIB_MM_PREFETCH(address, hint) +#endif + #include "space_l2.h" #include "space_ip.h" #include "stop_condition.h" diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index babf9741..d1900d90 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -383,13 +383,13 @@ class Index { memset(link_list_npy, 0, link_npy_size); - memcpy(data_level0_npy, appr_alg->data_level0_memory_, level0_npy_size); + appr_alg->data_level0_memory_.copyTo(data_level0_npy, level0_npy_size); memcpy(element_levels_npy, appr_alg->element_levels_.data(), appr_alg->element_levels_.size() * sizeof(int)); for (size_t i = 0; i < appr_alg->cur_element_count; i++) { - size_t linkListSize = appr_alg->element_levels_[i] > 0 ? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; + size_t linkListSize = appr_alg->size_links_per_element_ * appr_alg->element_levels_[i]; if (linkListSize) { - memcpy(link_list_npy + link_npy_offsets[i], appr_alg->linkLists_[i], linkListSize); + memcpy(link_list_npy + link_npy_offsets[i], appr_alg->getLinkListPtr(i), linkListSize); } } @@ -576,18 +576,19 @@ class Index { link_npy_size += linkListSize; } - memcpy(appr_alg->data_level0_memory_, data_level0_npy.data(), data_level0_npy.nbytes()); + appr_alg->data_level0_memory_.copyFrom(data_level0_npy.data(), data_level0_npy.nbytes()); for (size_t i = 0; i < appr_alg->max_elements_; i++) { size_t linkListSize = appr_alg->element_levels_[i] > 0 ? 
appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; if (linkListSize == 0) { - appr_alg->linkLists_[i] = nullptr; + appr_alg->setLinkListPtr(i, nullptr); } else { - appr_alg->linkLists_[i] = (char*)malloc(linkListSize); - if (appr_alg->linkLists_[i] == nullptr) + char* linkListPtr = reinterpret_cast(malloc(linkListSize)); + if (linkListPtr == nullptr) HNSWLIB_THROW_RUNTIME_ERROR("Not enough memory: loadIndex failed to allocate linklist"); + appr_alg->setLinkListPtr(i, linkListPtr); - memcpy(appr_alg->linkLists_[i], link_list_npy.data() + link_npy_offsets[i], linkListSize); + memcpy(linkListPtr, link_list_npy.data() + link_npy_offsets[i], linkListSize); } }