diff --git a/.gitignore b/.gitignore index c4045e98..d2cde965 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ -hnswlib.egg-info/ -build/ -dist/ -tmp/ -python_bindings/tests/__pycache__/ -*.pyd -hnswlib.cpython*.so +hnswlib.egg-info/ +build/ +dist/ +tmp/ +python_bindings/tests/__pycache__/ +*.pyd +hnswlib.cpython*.so +var/ diff --git a/.travis.yml b/.travis.yml index 893441e9..2c3c9960 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,9 +30,8 @@ jobs: install: - | - pip install -r requirements.txt - python setup.py install + python -m pip install . script: - | - python setup.py test + python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py" diff --git a/Makefile b/Makefile index 792b246e..b5e8fda9 100644 --- a/Makefile +++ b/Makefile @@ -3,12 +3,13 @@ pypi: dist dist: -rm dist/* - python3 setup.py sdist + pip install build + python3 -m build --sdist test: - python3 setup.py test + python3 -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py" clean: rm -rf *.egg-info build dist tmp var tests/__pycache__ hnswlib.cpython*.so -.PHONY: dist \ No newline at end of file +.PHONY: dist diff --git a/README.md b/README.md index 89cce5ce..90105f0d 100644 --- a/README.md +++ b/README.md @@ -213,8 +213,9 @@ print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(dat You can install from sources: ```bash apt-get install -y python-setuptools python-pip -pip3 install pybind11 numpy setuptools -python3 setup.py install +git clone https://github.com/nmslib/hnswlib.git +cd hnswlib +pip install . ``` or you can install via pip: diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..e00b3fb8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,9 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel", + "numpy>=1.10.0", + "pybind11>=2.0", +] + +build-backend = "setuptools.build_meta" diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py index 009b2164..d718bc3b 100644 --- a/python_bindings/tests/bindings_test.py +++ b/python_bindings/tests/bindings_test.py @@ -1,11 +1,13 @@ import os import unittest +import numpy as np + +import hnswlib + class RandomSelfTestCase(unittest.TestCase): def testRandomSelf(self): - import hnswlib - import numpy as np dim = 16 num_elements = 10000 @@ -41,7 +43,7 @@ def testRandomSelf(self): # Query the elements for themselves and measure recall: labels, distances = p.knn_query(data1, k=1) - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3) + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3) # Serializing and deleting the index: index_path = 'first_half.bin' @@ -61,10 +63,6 @@ def testRandomSelf(self): # Query the elements for themselves and measure recall: labels, distances = p.knn_query(data, k=1) - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3) + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3) os.remove(index_path) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/python_bindings/tests/bindings_test_getdata.py b/python_bindings/tests/bindings_test_getdata.py index 3e234518..8655d7f8 100644 --- a/python_bindings/tests/bindings_test_getdata.py +++ b/python_bindings/tests/bindings_test_getdata.py @@ -1,11 +1,13 @@ import unittest +import numpy as np + +import hnswlib + class RandomSelfTestCase(unittest.TestCase): def testGettingItems(self): print("\n**** Getting the data by label test ****\n") - import hnswlib - import numpy as np dim = 16 num_elements = 10000 @@ -42,6 +44,3 @@ def testGettingItems(self): # After adding them, all labels should be retrievable returned_items = p.get_items(labels) self.assertSequenceEqual(data.tolist(), returned_items) - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/python_bindings/tests/bindings_test_labels.py b/python_bindings/tests/bindings_test_labels.py index e44b0988..5c13e198 100644 --- a/python_bindings/tests/bindings_test_labels.py +++ b/python_bindings/tests/bindings_test_labels.py @@ -1,131 +1,127 @@ import os import unittest +import numpy as np -class RandomSelfTestCase(unittest.TestCase): - def testRandomSelf(self): - for idx in range(16): - print("\n**** Index save-load test ****\n") - import hnswlib - import numpy as np - - np.random.seed(idx) - dim = 16 - num_elements = 10000 - - # Generating sample data - data = np.float32(np.random.random((num_elements, dim))) - - # Declaring index - p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip - - # Initing index - # max_elements - the maximum number of elements, should be known beforehand - # (probably will be made optional in the future) - # - # ef_construction - controls index search speed/build speed tradeoff - # M - is tightly connected with internal dimensionality of the data - # stronlgy affects the memory consumption - - p.init_index(max_elements = num_elements, ef_construction = 100, M = 16) - - # Controlling the recall by setting ef: - # higher ef leads to better accuracy, but slower search - p.set_ef(100) - - p.set_num_threads(4) # by default using all available cores - - # We split the data in two batches: - data1 = data[:num_elements // 2] - data2 = data[num_elements // 2:] - - print("Adding first batch of %d elements" % (len(data1))) - p.add_items(data1) - - # Query the elements for themselves and measure recall: - labels, distances = p.knn_query(data1, k=1) - - items=p.get_items(labels) - - # Check the recall: - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3) - - # Check that the returned element data is correct: - diff_with_gt_labels=np.mean(np.abs(data1-items)) - self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) - - # Serializing and deleting the index. - # We need the part to check that serialization is working properly. - - index_path = 'first_half.bin' - print("Saving index to '%s'" % index_path) - p.save_index(index_path) - print("Saved. Deleting...") - del p - print("Deleted") - - print("\n**** Mark delete test ****\n") - # Reiniting, loading the index - print("Reiniting") - p = hnswlib.Index(space='l2', dim=dim) - - print("\nLoading index from '%s'\n" % index_path) - p.load_index(index_path) - p.set_ef(100) - - print("Adding the second batch of %d elements" % (len(data2))) - p.add_items(data2) - - # Query the elements for themselves and measure recall: - labels, distances = p.knn_query(data, k=1) - items=p.get_items(labels) - - # Check the recall: - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3) - - # Check that the returned element data is correct: - diff_with_gt_labels=np.mean(np.abs(data-items)) - self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) # deleting index. - - # Checking that all labels are returned correctly: - sorted_labels=sorted(p.get_ids_list()) - self.assertEqual(np.sum(~np.asarray(sorted_labels)==np.asarray(range(num_elements))),0) - - # Delete data1 - labels1, _ = p.knn_query(data1, k=1) - - for l in labels1: - p.mark_deleted(l[0]) - labels2, _ = p.knn_query(data2, k=1) - items=p.get_items(labels2) - diff_with_gt_labels=np.mean(np.abs(data2-items)) - self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-3) # console - - - labels1_after, _ = p.knn_query(data1, k=1) - for la in labels1_after: - for lb in labels1: - if la[0] == lb[0]: - self.assertTrue(False) - print("All the data in data1 are removed") +import hnswlib - # checking saving/loading index with elements marked as deleted - del_index_path = "with_deleted.bin" - p.save_index(del_index_path) - p = hnswlib.Index(space='l2', dim=dim) - p.load_index(del_index_path) - p.set_ef(100) - labels1_after, _ = p.knn_query(data1, k=1) - for la in labels1_after: - for lb in labels1: - if la[0] == lb[0]: - self.assertTrue(False) - - os.remove(index_path) - os.remove(del_index_path) +class RandomSelfTestCase(unittest.TestCase): + def testRandomSelf(self): + for idx in range(16): + print("\n**** Index save-load test ****\n") + np.random.seed(idx) + dim = 16 + num_elements = 10000 + # Generating sample data + data = np.float32(np.random.random((num_elements, dim))) -if __name__ == "__main__": - unittest.main() + # Declaring index + p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip + + # Initing index + # max_elements - the maximum number of elements, should be known beforehand + # (probably will be made optional in the future) + # + # ef_construction - controls index search speed/build speed tradeoff + # M - is tightly connected with internal dimensionality of the data + # stronlgy affects the memory consumption + + p.init_index(max_elements=num_elements, ef_construction=100, M=16) + + # Controlling the recall by setting ef: + # higher ef leads to better accuracy, but slower search + p.set_ef(100) + + p.set_num_threads(4) # by default using all available cores + + # We split the data in two batches: + data1 = data[:num_elements // 2] + data2 = data[num_elements // 2:] + + print("Adding first batch of %d elements" % (len(data1))) + p.add_items(data1) + + # Query the elements for themselves and measure recall: + labels, distances = p.knn_query(data1, k=1) + + items=p.get_items(labels) + + # Check the recall: + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3) + + # Check that the returned element data is correct: + diff_with_gt_labels=np.mean(np.abs(data1-items)) + self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) + + # Serializing and deleting the index. + # We need the part to check that serialization is working properly. + + index_path = 'first_half.bin' + print("Saving index to '%s'" % index_path) + p.save_index(index_path) + print("Saved. Deleting...") + del p + print("Deleted") + + print("\n**** Mark delete test ****\n") + # Reiniting, loading the index + print("Reiniting") + p = hnswlib.Index(space='l2', dim=dim) + + print("\nLoading index from '%s'\n" % index_path) + p.load_index(index_path) + p.set_ef(100) + + print("Adding the second batch of %d elements" % (len(data2))) + p.add_items(data2) + + # Query the elements for themselves and measure recall: + labels, distances = p.knn_query(data, k=1) + items=p.get_items(labels) + + # Check the recall: + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3) + + # Check that the returned element data is correct: + diff_with_gt_labels=np.mean(np.abs(data-items)) + self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) # deleting index. + + # Checking that all labels are returned correctly: + sorted_labels=sorted(p.get_ids_list()) + self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0) + + # Delete data1 + labels1, _ = p.knn_query(data1, k=1) + + for l in labels1: + p.mark_deleted(l[0]) + labels2, _ = p.knn_query(data2, k=1) + items=p.get_items(labels2) + diff_with_gt_labels = np.mean(np.abs(data2-items)) + self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-3) # console + + labels1_after, _ = p.knn_query(data1, k=1) + for la in labels1_after: + for lb in labels1: + if la[0] == lb[0]: + self.assertTrue(False) + print("All the data in data1 are removed") + + # checking saving/loading index with elements marked as deleted + del_index_path = "with_deleted.bin" + p.save_index(del_index_path) + p = hnswlib.Index(space='l2', dim=dim) + p.load_index(del_index_path) + p.set_ef(100) + + labels1_after, _ = p.knn_query(data1, k=1) + for la in labels1_after: + for lb in labels1: + if la[0] == lb[0]: + self.assertTrue(False) + + os.remove(index_path) + os.remove(del_index_path) diff --git a/python_bindings/tests/bindings_test_pickle.py b/python_bindings/tests/bindings_test_pickle.py index 6c3a826a..3a42df2e 100644 --- a/python_bindings/tests/bindings_test_pickle.py +++ b/python_bindings/tests/bindings_test_pickle.py @@ -1,28 +1,30 @@ +import pickle import unittest import numpy as np + import hnswlib -import pickle def get_dist(metric, pt1, pt2): if metric == 'l2': return np.sum((pt1-pt2)**2) elif metric == 'ip': - return 1. - np.sum(np.multiply(pt1,pt2)) + return 1. - np.sum(np.multiply(pt1, pt2)) elif metric == 'cosine': - return 1. - np.sum(np.multiply(pt1,pt2)) / (np.sum(pt1**2) * np.sum(pt2**2))**.5 + return 1. - np.sum(np.multiply(pt1, pt2)) / (np.sum(pt1**2) * np.sum(pt2**2))**.5 + def brute_force_distances(metric, items, query_items, k): - dists=np.zeros((query_items.shape[0], items.shape[0])) + dists = np.zeros((query_items.shape[0], items.shape[0])) for ii in range(items.shape[0]): for jj in range(query_items.shape[0]): - dists[jj,ii]=get_dist(metric, items[ii, :], query_items[jj, :]) + dists[jj,ii] = get_dist(metric, items[ii, :], query_items[jj, :]) labels = np.argsort(dists, axis=1) # equivalent, but faster: np.argpartition(dists, range(k), axis=1) dists = np.sort(dists, axis=1) # equivalent, but faster: np.partition(dists, range(k), axis=1) - return labels[:,:k], dists[:,:k] + return labels[:, :k], dists[:, :k] def check_ann_results(self, metric, items, query_items, k, ann_l, ann_d, err_thresh=0, total_thresh=0, dists_thresh=0): @@ -36,14 +38,15 @@ def check_ann_results(self, metric, items, query_items, k, ann_l, ann_d, err_thr if err > err_thresh: err_total += 1 - self.assertLessEqual( err_total, total_thresh, f"Error: knn_query returned incorrect labels for {err_total} items (k={k})") + self.assertLessEqual(err_total, total_thresh, f"Error: knn_query returned incorrect labels for {err_total} items (k={k})") - wrong_dists=np.sum(((brute_d- ann_d)**2.)>1e-3) + wrong_dists = np.sum(((brute_d - ann_d)**2.) > 1e-3) if wrong_dists > 0: - dists_count=brute_d.shape[0]*brute_d.shape[1] + dists_count = brute_d.shape[0]*brute_d.shape[1] print(f"Warning: {wrong_dists} ann distance values are different from brute-force values (total # of values={dists_count}, dists_thresh={dists_thresh})") - self.assertLessEqual( wrong_dists, dists_thresh, msg=f"Error: {wrong_dists} ann distance values are different from brute-force values") + self.assertLessEqual(wrong_dists, dists_thresh, msg=f"Error: {wrong_dists} ann distance values are different from brute-force values") + def test_space_main(self, space, dim): @@ -55,16 +58,16 @@ def test_space_main(self, space, dim): p = hnswlib.Index(space=space, dim=dim) # possible options are l2, cosine or ip print(f"Running pickle tests for {p}") - p.num_threads=self.num_threads # by default using all available cores + p.num_threads = self.num_threads # by default using all available cores - p0=pickle.loads(pickle.dumps(p)) ### pickle un-initialized Index - p.init_index(max_elements = self.num_elements, ef_construction = self.ef_construction, M = self.M) - p0.init_index(max_elements = self.num_elements, ef_construction = self.ef_construction, M = self.M) + p0 = pickle.loads(pickle.dumps(p)) ### pickle un-initialized Index + p.init_index(max_elements=self.num_elements, ef_construction=self.ef_construction, M=self.M) + p0.init_index(max_elements=self.num_elements, ef_construction=self.ef_construction, M=self.M) - p.ef=self.ef - p0.ef=self.ef + p.ef = self.ef + p0.ef = self.ef - p1=pickle.loads(pickle.dumps(p)) ### pickle Index before adding items + p1 = pickle.loads(pickle.dumps(p)) ### pickle Index before adding items ### add items to ann index p,p0,p1 p.add_items(data) @@ -78,7 +81,7 @@ def test_space_main(self, space, dim): self.assertTrue(np.allclose(p1.get_items(), p2.get_items()), "items for p1 and p2 must be same") ### Test if returned distances are same - l, d = p.knn_query(test_data, k=self.k) + l, d = p.knn_query(test_data, k=self.k) l0, d0 = p0.knn_query(test_data, k=self.k) l1, d1 = p1.knn_query(test_data, k=self.k) l2, d2 = p2.knn_query(test_data, k=self.k) @@ -90,9 +93,9 @@ def test_space_main(self, space, dim): ### check if ann results match brute-force search ### allow for 2 labels to be missing from ann results check_ann_results(self, space, data, test_data, self.k, l, d, - err_thresh = self.label_err_thresh, - total_thresh = self.item_err_thresh, - dists_thresh = self.dists_err_thresh) + err_thresh=self.label_err_thresh, + total_thresh=self.item_err_thresh, + dists_thresh=self.dists_err_thresh) check_ann_results(self, space, data, test_data, self.k, l2, d2, err_thresh=self.label_err_thresh, @@ -118,7 +121,6 @@ def test_space_main(self, space, dim): self.assertEqual(p2.ef_construction, self.ef_construction, "incorrect value of p2.ef_construction") - class PickleUnitTests(unittest.TestCase): def setUp(self): @@ -133,10 +135,10 @@ def setUp(self): self.num_threads = 4 self.k = 25 - self.label_err_thresh=5 ### max number of missing labels allowed per test item - self.item_err_thresh=5 ### max number of items allowed with incorrect labels + self.label_err_thresh = 5 ### max number of missing labels allowed per test item + self.item_err_thresh = 5 ### max number of items allowed with incorrect labels - self.dists_err_thresh=50 ### for two matrices, d1 and d2, dists_err_thresh controls max + self.dists_err_thresh = 50 ### for two matrices, d1 and d2, dists_err_thresh controls max ### number of value pairs that are allowed to be different in d1 and d2 ### i.e., number of values that are (d1-d2)**2>1e-3 @@ -148,6 +150,3 @@ def test_l2_space(self): def test_cosine_space(self): test_space_main(self, 'cosine', 512) - -if __name__ == "__main__": - unittest.main() diff --git a/python_bindings/tests/bindings_test_resize.py b/python_bindings/tests/bindings_test_resize.py index 9411af64..3c4e3e4f 100644 --- a/python_bindings/tests/bindings_test_resize.py +++ b/python_bindings/tests/bindings_test_resize.py @@ -1,12 +1,15 @@ import unittest +import numpy as np + +import hnswlib + class RandomSelfTestCase(unittest.TestCase): def testRandomSelf(self): for idx in range(16): print("\n**** Index resize test ****\n") - import hnswlib - import numpy as np + np.random.seed(idx) dim = 16 num_elements = 10000 @@ -25,7 +28,7 @@ def testRandomSelf(self): # M - is tightly connected with internal dimensionality of the data # stronlgy affects the memory consumption - p.init_index(max_elements = num_elements//2, ef_construction = 100, M = 16) + p.init_index(max_elements=num_elements//2, ef_construction=100, M=16) # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search @@ -43,20 +46,18 @@ def testRandomSelf(self): # Query the elements for themselves and measure recall: labels, distances = p.knn_query(data1, k=1) - items=p.get_items(list(range(len(data1)))) + items = p.get_items(list(range(len(data1)))) # Check the recall: - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3) + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3) # Check that the returned element data is correct: - diff_with_gt_labels=np.max(np.abs(data1-items)) - self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) + diff_with_gt_labels = np.max(np.abs(data1-items)) + self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) print("Resizing the index") p.resize_index(num_elements) - - print("Adding the second batch of %d elements" % (len(data2))) p.add_items(data2) @@ -65,18 +66,12 @@ def testRandomSelf(self): items=p.get_items(list(range(num_elements))) # Check the recall: - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3) + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3) # Check that the returned element data is correct: diff_with_gt_labels=np.max(np.abs(data-items)) - self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) + self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) # Checking that all labels are returned correcly: sorted_labels=sorted(p.get_ids_list()) - self.assertEqual(np.sum(~np.asarray(sorted_labels)==np.asarray(range(num_elements))),0) - - - - -if __name__ == "__main__": - unittest.main() + self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0) diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 81fbf192..00000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -numpy>=1.10.0 -pybind11>=2.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 002f3893..929bc211 100644 --- a/setup.py +++ b/setup.py @@ -1,19 +1,28 @@ import os -from setuptools import setup, Extension -from setuptools.command.build_ext import build_ext import sys + +import numpy as np +import pybind11 import setuptools +from setuptools import Extension, setup +from setuptools.command.build_ext import build_ext __version__ = '0.4.0' + +include_dirs = [ + pybind11.get_include(), + np.get_include(), +] + # compatibility when run in python_bindings bindings_dir = 'python_bindings' if bindings_dir in os.path.basename(os.getcwd()): source_files = ['./bindings.cpp'] - include_dirs = ['../hnswlib/'] + include_dirs.extend(['../hnswlib/']) else: source_files = ['./python_bindings/bindings.cpp'] - include_dirs = ['./hnswlib/'] + include_dirs.extend(['./hnswlib/']) libraries = [] @@ -90,21 +99,9 @@ def build_extensions(self): elif ct == 'msvc': opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) - # extend include dirs here (don't assume numpy/pybind11 are installed when first run, since - # pip could have installed them as part of executing this script - import pybind11 - import numpy as np for ext in self.extensions: ext.extra_compile_args.extend(opts) ext.extra_link_args.extend(self.link_opts.get(ct, [])) - ext.include_dirs.extend([ - # Path to pybind11 headers - pybind11.get_include(), - pybind11.get_include(True), - - # Path to numpy headers - np.get_include() - ]) build_ext.build_extensions(self) @@ -117,8 +114,7 @@ def build_extensions(self): url='https://github.com/yurymalkov/hnsw', long_description="""hnsw""", ext_modules=ext_modules, - install_requires=['pybind11>=2.0', 'numpy'], + install_requires=['numpy'], cmdclass={'build_ext': BuildExt}, - test_suite="python_bindings.tests", zip_safe=False, )