Skip to content

Commit 6e946ed

Browse files
authored
Merge branch 'nmslib:master' into fix-prefetch
2 parents b17e455 + 359b2ba commit 6e946ed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+1301
-263
lines changed

.github/workflows/build.yml

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ jobs:
2020

2121
- name: Test
2222
timeout-minutes: 15
23-
run: python -m unittest discover -v --start-directory python_bindings/tests --pattern "*_test*.py"
23+
run: |
24+
python -m unittest discover -v --start-directory examples/python --pattern "example*.py"
25+
python -m unittest discover -v --start-directory tests/python --pattern "bindings_test*.py"
2426
2527
test_cpp:
2628
runs-on: ${{matrix.os}}
@@ -48,7 +50,7 @@ jobs:
4850
- name: Prepare test data
4951
run: |
5052
pip install numpy
51-
cd examples
53+
cd tests/cpp/
5254
python update_gen_data.py
5355
shell: bash
5456

@@ -59,6 +61,12 @@ jobs:
5961
if [ "$RUNNER_OS" == "Windows" ]; then
6062
cp ./Release/* ./
6163
fi
64+
./example_search
65+
./example_filter
66+
./example_replace_deleted
67+
./example_mt_search
68+
./example_mt_filter
69+
./example_mt_replace_deleted
6270
./searchKnnCloserFirst_test
6371
./searchKnnWithFilter_test
6472
./multiThreadLoad_test

ALGO_PARAMS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,5 +27,5 @@ ef_construction leads to longer construction, but better index quality. At some
2727
not improve the quality of the index. One way to check if the selection of ef_construction was ok is to measure a recall
2828
for M nearest neighbor search when ```ef``` =```ef_construction```: if the recall is lower than 0.9, then there is room
2929
for improvement.
30-
* ```num_elements``` - defines the maximum number of elements in the index. The index can be extened by saving/loading(load_index
30+
* ```num_elements``` - defines the maximum number of elements in the index. The index can be extended by saving/loading (load_index
3131
function has a parameter which defines the new maximum number of elements).

CMakeLists.txt

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,41 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
1616
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
1717
endif()
1818

19-
add_executable(test_updates examples/updates_test.cpp)
19+
# examples
20+
add_executable(example_search examples/cpp/example_search.cpp)
21+
target_link_libraries(example_search hnswlib)
22+
23+
add_executable(example_filter examples/cpp/example_filter.cpp)
24+
target_link_libraries(example_filter hnswlib)
25+
26+
add_executable(example_replace_deleted examples/cpp/example_replace_deleted.cpp)
27+
target_link_libraries(example_replace_deleted hnswlib)
28+
29+
add_executable(example_mt_search examples/cpp/example_mt_search.cpp)
30+
target_link_libraries(example_mt_search hnswlib)
31+
32+
add_executable(example_mt_filter examples/cpp/example_mt_filter.cpp)
33+
target_link_libraries(example_mt_filter hnswlib)
34+
35+
add_executable(example_mt_replace_deleted examples/cpp/example_mt_replace_deleted.cpp)
36+
target_link_libraries(example_mt_replace_deleted hnswlib)
37+
38+
# tests
39+
add_executable(test_updates tests/cpp/updates_test.cpp)
2040
target_link_libraries(test_updates hnswlib)
2141

22-
add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp)
42+
add_executable(searchKnnCloserFirst_test tests/cpp/searchKnnCloserFirst_test.cpp)
2343
target_link_libraries(searchKnnCloserFirst_test hnswlib)
2444

25-
add_executable(searchKnnWithFilter_test examples/searchKnnWithFilter_test.cpp)
45+
add_executable(searchKnnWithFilter_test tests/cpp/searchKnnWithFilter_test.cpp)
2646
target_link_libraries(searchKnnWithFilter_test hnswlib)
2747

28-
add_executable(multiThreadLoad_test examples/multiThreadLoad_test.cpp)
48+
add_executable(multiThreadLoad_test tests/cpp/multiThreadLoad_test.cpp)
2949
target_link_libraries(multiThreadLoad_test hnswlib)
3050

31-
add_executable(multiThread_replace_test examples/multiThread_replace_test.cpp)
51+
add_executable(multiThread_replace_test tests/cpp/multiThread_replace_test.cpp)
3252
target_link_libraries(multiThread_replace_test hnswlib)
3353

34-
add_executable(main main.cpp sift_1b.cpp)
54+
add_executable(main tests/cpp/main.cpp tests/cpp/sift_1b.cpp)
3555
target_link_libraries(main hnswlib)
3656
endif()

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ dist:
77
python3 -m build --sdist
88

99
test:
10-
python3 -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"
10+
python3 -m unittest discover --start-directory tests/python --pattern "bindings_test*.py"
1111

1212
clean:
1313
rm -rf *.egg-info build dist tmp var tests/__pycache__ hnswlib.cpython*.so

README.md

Lines changed: 32 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,22 @@
11
# Hnswlib - fast approximate nearest neighbor search
2-
Header-only C++ HNSW implementation with python bindings.
2+
Header-only C++ HNSW implementation with python bindings, insertions and updates.
33

44
**NEWS:**
55

6+
**version 0.7.0**
67

7-
**version 0.6.2**
8-
9-
* Fixed a bug in saving of large pickles. The pickles with > 4GB could have been corrupted. Thanks Kai Wohlfahrt for reporting.
10-
* Thanks to ([@GuyAv46](https://github.com/GuyAv46)) hnswlib inner product now is more consitent accross architectures (SSE, AVX, etc).
11-
*
12-
13-
**version 0.6.1**
14-
15-
* Thanks to ([@tony-kuo](https://github.com/tony-kuo)) hnswlib AVX512 and AVX builds are not backwards-compatible with older SSE and non-AVX512 architectures.
16-
* Thanks to ([@psobot](https://github.com/psobot)) there is now a sencible message instead of segfault when passing a scalar to get_items.
17-
* Thanks to ([@urigoren](https://github.com/urigoren)) hnswlib has a lazy index creation python wrapper.
18-
19-
**version 0.6.0**
20-
* Thanks to ([@dyashuni](https://github.com/dyashuni)) hnswlib now uses github actions for CI, there is a search speedup in some scenarios with deletions. `unmark_deleted(label)` is now also a part of the python interface (note now it throws an exception for double deletions).
21-
* Thanks to ([@slice4e](https://github.com/slice4e)) we now support AVX512; thanks to ([@LTLA](https://github.com/LTLA)) the cmake interface for the lib is now updated.
22-
* Thanks to ([@alonre24](https://github.com/alonre24)) we now have a python bindings for brute-force (and examples for recall tuning: [TESTING_RECALL.md](TESTING_RECALL.md).
23-
* Thanks to ([@dorosy-yeong](https://github.com/dorosy-yeong)) there is a bug fixed in the handling large quantities of deleted elements and large K.
24-
25-
8+
* Added support for filtering (#402, #430) by [@kishorenc](https://github.com/kishorenc)
9+
* Added python interface for filtering (though note its performance is limited by GIL) (#417) by [@gtsoukas](https://github.com/gtsoukas)
10+
* Added support for replacing the elements that were marked as deleted with newly inserted elements (to control the size of the index, #418) by [@dyashuni](https://github.com/dyashuni)
11+
* Fixed data races/deadlocks in updates/insertion, added stress test for multithreaded operation (#418) by [@dyashuni](https://github.com/dyashuni)
12+
* Documentation, tests, exception handling, refactoring (#375, #379, #380, #395, #396, #401, #406, #404, #409, #410, #416, #415, #431, #432, #433) by [@jlmelville](https://github.com/jlmelville), [@dyashuni](https://github.com/dyashuni), [@kishorenc](https://github.com/kishorenc), [@korzhenevski](https://github.com/korzhenevski), [@yoshoku](https://github.com/yoshoku), [@jianshu93](https://github.com/jianshu93), [@PLNech](https://github.com/PLNech)
13+
* global linkages (#383) by [@MasterAler](https://github.com/MasterAler), USE_SSE usage in MSVC (#408) by [@alxvth](https://github.com/alxvth)
2614

2715

2816
### Highlights:
2917
1) Lightweight, header-only, no dependencies other than C++ 11
30-
2) Interfaces for C++, Java, Python and R (https://github.com/jlmelville/rcpphnsw).
31-
3) Has full support for incremental index construction. Has support for element deletions
18+
2) Interfaces for C++, Python, external support for Java and R (https://github.com/jlmelville/rcpphnsw).
19+
3) Has full support for incremental index construction and updating the elements. Has support for element deletions
3220
(by marking them in index). Index is picklable.
3321
4) Can work with custom user defined distances (C++).
3422
5) Significantly less memory footprint and faster build time compared to current nmslib's implementation.
@@ -50,7 +38,7 @@ Note that inner product is not an actual metric. An element can be closer to som
5038

5139
For other spaces use the nmslib library https://github.com/nmslib/nmslib.
5240

53-
#### Short API description
41+
#### API description
5442
* `hnswlib.Index(space, dim)` creates a non-initialized HNSW index in space `space` with integer dimension `dim`.
5543

5644
`hnswlib.Index` methods:
@@ -80,7 +68,7 @@ For other spaces use the nmslib library https://github.com/nmslib/nmslib.
8068
* `knn_query(data, k = 1, num_threads = -1, filter = None)` make a batch query for `k` closest elements for each element of the
8169
* `data` (shape:`N*dim`). Returns a numpy array of (shape:`N*k`).
8270
* `num_threads` sets the number of cpu threads to use (-1 means use default).
83-
* `filter` filters elements by its labels, returns elements with allowed ids
71+
* `filter` filters elements by their labels, returns elements with allowed ids. Note that search with a filter is slow in Python in multithreaded mode. It is recommended to set `num_threads=1`
8472
* Thread-safe with other `knn_query` calls, but not with `add_items`.
8573

8674
* `load_index(path_to_index, max_elements = 0, allow_replace_deleted = False)` loads the index from persistence to the uninitialized index.
@@ -123,6 +111,12 @@ Properties of `hnswlib.Index` that support reading and writing:
123111
124112

125113
#### Python bindings examples
114+
[See more examples here](examples/python/EXAMPLES.md):
115+
* Creating index, inserting elements, searching, serialization/deserialization
116+
* Filtering during the search with a boolean function
117+
* Deleting the elements and reusing the memory of the deleted elements for newly added elements
118+
119+
An example of creating index, inserting elements, searching and pickle serialization:
126120
```python
127121
import hnswlib
128122
import numpy as np
@@ -229,103 +223,13 @@ labels, distances = p.knn_query(data, k=1)
229223
print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n")
230224
```
231225

232-
An example with a filter:
233-
```python
234-
import hnswlib
235-
import numpy as np
236-
237-
dim = 16
238-
num_elements = 10000
239-
240-
# Generating sample data
241-
data = np.float32(np.random.random((num_elements, dim)))
242-
243-
# Declaring index
244-
hnsw_index = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip
245-
246-
# Initiating index
247-
# max_elements - the maximum number of elements, should be known beforehand
248-
# (probably will be made optional in the future)
249-
#
250-
# ef_construction - controls index search speed/build speed tradeoff
251-
# M - is tightly connected with internal dimensionality of the data
252-
# strongly affects the memory consumption
253-
254-
hnsw_index.init_index(max_elements=num_elements, ef_construction=100, M=16)
255-
256-
# Controlling the recall by setting ef:
257-
# higher ef leads to better accuracy, but slower search
258-
hnsw_index.set_ef(10)
259-
260-
# Set number of threads used during batch search/construction
261-
# By default using all available cores
262-
hnsw_index.set_num_threads(4)
263-
264-
print("Adding %d elements" % (len(data)))
265-
# Added elements will have consecutive ids
266-
hnsw_index.add_items(data, ids=np.arange(num_elements))
267-
268-
print("Querying only even elements")
269-
# Define filter function that allows only even ids
270-
filter_function = lambda idx: idx%2 == 0
271-
# Query the elements for themselves and search only for even elements:
272-
labels, distances = hnsw_index.knn_query(data, k=1, filter=filter_function)
273-
# labels contain only elements with even id
274-
```
275-
276-
An example with replacing of deleted elements:
277-
```python
278-
import hnswlib
279-
import numpy as np
280-
281-
dim = 16
282-
num_elements = 1_000
283-
max_num_elements = 2 * num_elements
284-
285-
# Generating sample data
286-
labels1 = np.arange(0, num_elements)
287-
data1 = np.float32(np.random.random((num_elements, dim))) # batch 1
288-
labels2 = np.arange(num_elements, 2 * num_elements)
289-
data2 = np.float32(np.random.random((num_elements, dim))) # batch 2
290-
labels3 = np.arange(2 * num_elements, 3 * num_elements)
291-
data3 = np.float32(np.random.random((num_elements, dim))) # batch 3
292-
293-
# Declaring index
294-
hnsw_index = hnswlib.Index(space='l2', dim=dim)
295-
296-
# Initiating index
297-
# max_elements - the maximum number of elements, should be known beforehand
298-
# (probably will be made optional in the future)
299-
#
300-
# ef_construction - controls index search speed/build speed tradeoff
301-
# M - is tightly connected with internal dimensionality of the data
302-
# strongly affects the memory consumption
303-
304-
# Enable replacing of deleted elements
305-
hnsw_index.init_index(max_elements=max_num_elements, ef_construction=200, M=16, allow_replace_deleted=True)
306-
307-
# Controlling the recall by setting ef:
308-
# higher ef leads to better accuracy, but slower search
309-
hnsw_index.set_ef(10)
226+
#### C++ examples
227+
[See examples here](examples/cpp/EXAMPLES.md):
228+
* creating index, inserting elements, searching, serialization/deserialization
229+
* filtering during the search with a boolean function
230+
* deleting the elements and reusing the memory of the deleted elements for newly added elements
231+
* multithreaded usage
310232

311-
# Set number of threads used during batch search/construction
312-
# By default using all available cores
313-
hnsw_index.set_num_threads(4)
314-
315-
# Add batch 1 and 2 data
316-
hnsw_index.add_items(data1, labels1)
317-
hnsw_index.add_items(data2, labels2) # Note: maximum number of elements is reached
318-
319-
# Delete data of batch 2
320-
for label in labels2:
321-
hnsw_index.mark_deleted(label)
322-
323-
# Replace deleted elements
324-
# Maximum number of elements is reached therefore we cannot add new items,
325-
# but we can replace the deleted ones by using replace_deleted=True
326-
hnsw_index.add_items(data3, labels3, replace_deleted=True)
327-
# hnsw_index contains the data of batch 1 and batch 3 only
328-
```
329233

330234
### Bindings installation
331235

@@ -346,9 +250,9 @@ Contributions are highly welcome!
346250

347251
Please make pull requests against the `develop` branch.
348252

349-
When making changes please run tests (and please add a test to `python_bindings/tests` in case there is new functionality):
253+
When making changes please run tests (and please add a test to `tests/python` in case there is new functionality):
350254
```bash
351-
python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"
255+
python -m unittest discover --start-directory tests/python --pattern "bindings_test*.py"
352256
```
353257

354258

@@ -360,20 +264,23 @@ https://github.com/facebookresearch/faiss
360264
["Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors"](https://arxiv.org/abs/1802.02422)
361265
(current state-of-the-art in compressed indexes, C++):
362266
https://github.com/dbaranchuk/ivf-hnsw
267+
* Amazon PECOS https://github.com/amzn/pecos
363268
* TOROS N2 (python, C++): https://github.com/kakao/n2
364269
* Online HNSW (C++): https://github.com/andrusha97/online-hnsw
365270
* Go implementation: https://github.com/Bithack/go-hnsw
366271
* Python implementation (as a part of the clustering code by Matteo Dell'Amico): https://github.com/matteodellamico/flexible-clustering
272+
* Julia implementation https://github.com/JuliaNeighbors/HNSW.jl
367273
* Java implementation: https://github.com/jelmerk/hnswlib
368274
* Java bindings using Java Native Access: https://github.com/stepstone-tech/hnswlib-jna
369-
* .Net implementation: https://github.com/microsoft/HNSW.Net
275+
* .Net implementation: https://github.com/curiosity-ai/hnsw-sharp
370276
* CUDA implementation: https://github.com/js1010/cuhnsw
277+
* Rust implementation https://github.com/rust-cv/hnsw
371278
* Rust implementation focused on memory and thread safety; it provides a trait that lets users implement their own distances, and it takes as data slices of types T satisfying T:Serialize+Clone+Send+Sync: https://github.com/jean-pierreBoth/hnswlib-rs
372279

373280
### 200M SIFT test reproduction
374281
To download and extract the bigann dataset (from root directory):
375282
```bash
376-
python3 download_bigann.py
283+
python tests/cpp/download_bigann.py
377284
```
378285
To compile:
379286
```bash
@@ -393,7 +300,7 @@ The size of the BigANN subset (in millions) is controlled by the variable **subs
393300
### Updates test
394301
To generate testing data (from root directory):
395302
```bash
396-
cd examples
303+
cd tests/cpp
397304
python update_gen_data.py
398305
```
399306
To compile (from root directory):

0 commit comments

Comments
 (0)