Merge pull request nmslib#284 from marekhanus/feature/pep

yurymalkov · web-flow · commit cc2b94f51334 · 2021-02-11T21:16:52.000-08:00
Feature/pep
diff --git a/examples/pyw_hnswlib.py b/examples/pyw_hnswlib.py
@@ -11,8 +11,8 @@ def __init__(self, space, dim):
         self.dict_labels = {}
         self.cur_ind = 0
 
-    def init_index(self, max_elements, ef_construction = 200, M = 16):
-        self.index.init_index(max_elements = max_elements, ef_construction = ef_construction, M = M)
+    def init_index(self, max_elements, ef_construction=200, M=16):
+        self.index.init_index(max_elements=max_elements, ef_construction=ef_construction, M=M)
 
     def add_items(self, data, ids=None):
         if ids is not None:
diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py
@@ -18,15 +18,15 @@ def testRandomSelf(self):
         # Declaring index
         p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
 
-        # Initing index
+        # Initializing index
         # max_elements - the maximum number of elements, should be known beforehand
         #     (probably will be made optional in the future)
         #
         # ef_construction - controls index search speed/build speed tradeoff
         # M - is tightly connected with internal dimensionality of the data
-        #     stronlgy affects the memory consumption
+        #     strongly affects the memory consumption
 
-        p.init_index(max_elements = num_elements, ef_construction = 100, M = 16)
+        p.init_index(max_elements=num_elements, ef_construction=100, M=16)
 
         # Controlling the recall by setting ef:
         # higher ef leads to better accuracy, but slower search
@@ -51,7 +51,7 @@ def testRandomSelf(self):
         p.save_index(index_path)
         del p
 
-        # Reiniting, loading the index
+        # Re-initializing, loading the index
         p = hnswlib.Index(space='l2', dim=dim)  # you can change the sa
 
         print("\nLoading index from '%s'\n" % index_path)
diff --git a/python_bindings/tests/bindings_test_getdata.py b/python_bindings/tests/bindings_test_getdata.py
@@ -19,13 +19,13 @@ def testGettingItems(self):
         # Declaring index
         p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
 
-        # Initing index
+        # Initializing index
         # max_elements - the maximum number of elements, should be known beforehand
         #     (probably will be made optional in the future)
         #
         # ef_construction - controls index search speed/build speed tradeoff
         # M - is tightly connected with internal dimensionality of the data
-        #     stronlgy affects the memory consumption
+        #     strongly affects the memory consumption
 
         p.init_index(max_elements=num_elements, ef_construction=100, M=16)
 
diff --git a/python_bindings/tests/bindings_test_labels.py b/python_bindings/tests/bindings_test_labels.py
@@ -21,13 +21,13 @@ def testRandomSelf(self):
             # Declaring index
             p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
 
-            # Initing index
+            # Initializing index
             # max_elements - the maximum number of elements, should be known beforehand
             #     (probably will be made optional in the future)
             #
             # ef_construction - controls index search speed/build speed tradeoff
             # M - is tightly connected with internal dimensionality of the data
-            #     stronlgy affects the memory consumption
+            #     strongly affects the memory consumption
 
             p.init_index(max_elements=num_elements, ef_construction=100, M=16)
 
@@ -47,7 +47,7 @@ def testRandomSelf(self):
             # Query the elements for themselves and measure recall:
             labels, distances = p.knn_query(data1, k=1)
 
-            items=p.get_items(labels)
+            items = p.get_items(labels)
 
             # Check the recall:
             self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3)
@@ -67,8 +67,8 @@ def testRandomSelf(self):
             print("Deleted")
 
             print("\n**** Mark delete test ****\n")
-            # Reiniting, loading the index
-            print("Reiniting")
+            # Re-initializing, loading the index
+            print("Re-initiating")
             p = hnswlib.Index(space='l2', dim=dim)
 
             print("\nLoading index from '%s'\n" % index_path)
@@ -80,17 +80,17 @@ def testRandomSelf(self):
 
             # Query the elements for themselves and measure recall:
             labels, distances = p.knn_query(data, k=1)
-            items=p.get_items(labels)
+            items = p.get_items(labels)
 
             # Check the recall:
             self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3)
 
             # Check that the returned element data is correct:
-            diff_with_gt_labels=np.mean(np.abs(data-items))
+            diff_with_gt_labels = np.mean(np.abs(data-items))
             self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) # deleting index.
 
             # Checking that all labels are returned correctly:
-            sorted_labels=sorted(p.get_ids_list())
+            sorted_labels = sorted(p.get_ids_list())
             self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0)
 
             # Delete data1
diff --git a/python_bindings/tests/bindings_test_pickle.py b/python_bindings/tests/bindings_test_pickle.py
@@ -60,38 +60,38 @@ def test_space_main(self, space, dim):
 
     p.num_threads = self.num_threads  # by default using all available cores
 
-    p0 = pickle.loads(pickle.dumps(p)) ### pickle un-initialized Index
+    p0 = pickle.loads(pickle.dumps(p)) # pickle un-initialized Index
     p.init_index(max_elements=self.num_elements, ef_construction=self.ef_construction, M=self.M)
     p0.init_index(max_elements=self.num_elements, ef_construction=self.ef_construction, M=self.M)
 
     p.ef = self.ef
     p0.ef = self.ef
 
-    p1 = pickle.loads(pickle.dumps(p)) ### pickle Index before adding items
+    p1 = pickle.loads(pickle.dumps(p)) # pickle Index before adding items
 
-    ### add items to ann index p,p0,p1
+    # add items to ann index p,p0,p1
     p.add_items(data)
     p1.add_items(data)
     p0.add_items(data)
 
-    p2=pickle.loads(pickle.dumps(p)) ### pickle Index before adding items
+    p2=pickle.loads(pickle.dumps(p)) # pickle Index after adding items
 
     self.assertTrue(np.allclose(p.get_items(), p0.get_items()), "items for p and p0 must be same")
     self.assertTrue(np.allclose(p0.get_items(), p1.get_items()), "items for p0 and p1 must be same")
     self.assertTrue(np.allclose(p1.get_items(), p2.get_items()), "items for p1 and p2 must be same")
 
-    ### Test if returned distances are same
+    # Test if returned distances are same
     l, d = p.knn_query(test_data, k=self.k)
     l0, d0 = p0.knn_query(test_data, k=self.k)
     l1, d1 = p1.knn_query(test_data, k=self.k)
     l2, d2 = p2.knn_query(test_data, k=self.k)
 
-    self.assertLessEqual(np.sum(((d-d0)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p and p0 must match")
-    self.assertLessEqual(np.sum(((d0-d1)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p0 and p1 must match")
-    self.assertLessEqual(np.sum(((d1-d2)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p1 and p2 must match")
+    self.assertLessEqual(np.sum(((d-d0)**2.) > 1e-3), self.dists_err_thresh, msg=f"knn distances returned by p and p0 must match")
+    self.assertLessEqual(np.sum(((d0-d1)**2.) > 1e-3), self.dists_err_thresh, msg=f"knn distances returned by p0 and p1 must match")
+    self.assertLessEqual(np.sum(((d1-d2)**2.) > 1e-3), self.dists_err_thresh, msg=f"knn distances returned by p1 and p2 must match")
 
-    ### check if ann results match brute-force search
-    ###   allow for 2 labels to be missing from ann results
+    # check if ann results match brute-force search
+    #   allow for 2 labels to be missing from ann results
     check_ann_results(self, space, data, test_data, self.k, l, d,
                            err_thresh=self.label_err_thresh,
                            total_thresh=self.item_err_thresh,
@@ -102,19 +102,19 @@ def test_space_main(self, space, dim):
                            total_thresh=self.item_err_thresh,
                            dists_thresh=self.dists_err_thresh)
 
-    ### Check ef parameter value
+    # Check ef parameter value
     self.assertEqual(p.ef, self.ef, "incorrect value of p.ef")
     self.assertEqual(p0.ef, self.ef, "incorrect value of p0.ef")
     self.assertEqual(p2.ef, self.ef, "incorrect value of p2.ef")
     self.assertEqual(p1.ef, self.ef, "incorrect value of p1.ef")
 
-    ### Check M parameter value
+    # Check M parameter value
     self.assertEqual(p.M, self.M, "incorrect value of p.M")
     self.assertEqual(p0.M, self.M, "incorrect value of p0.M")
     self.assertEqual(p1.M, self.M, "incorrect value of p1.M")
     self.assertEqual(p2.M, self.M, "incorrect value of p2.M")
 
-    ### Check ef_construction parameter value
+    # Check ef_construction parameter value
     self.assertEqual(p.ef_construction, self.ef_construction, "incorrect value of p.ef_construction")
     self.assertEqual(p0.ef_construction, self.ef_construction, "incorrect value of p0.ef_construction")
     self.assertEqual(p1.ef_construction, self.ef_construction, "incorrect value of p1.ef_construction")
@@ -135,12 +135,12 @@ def setUp(self):
         self.num_threads = 4
         self.k = 25
 
-        self.label_err_thresh = 5  ### max number of missing labels allowed per test item
-        self.item_err_thresh = 5   ### max number of items allowed with incorrect labels
+        self.label_err_thresh = 5  # max number of missing labels allowed per test item
+        self.item_err_thresh = 5   # max number of items allowed with incorrect labels
 
-        self.dists_err_thresh = 50 ### for two matrices, d1 and d2, dists_err_thresh controls max
-                                 ### number of value pairs that are allowed to be different in d1 and d2
-                                 ### i.e., number of values that are (d1-d2)**2>1e-3
+        self.dists_err_thresh = 50 # for two matrices, d1 and d2, dists_err_thresh controls max
+                                 # number of value pairs that are allowed to be different in d1 and d2
+                                 # i.e., number of values that are (d1-d2)**2>1e-3
 
     def test_inner_product_space(self):
         test_space_main(self, 'ip', 48)
diff --git a/python_bindings/tests/bindings_test_resize.py b/python_bindings/tests/bindings_test_resize.py
@@ -7,71 +7,71 @@
 
 class RandomSelfTestCase(unittest.TestCase):
     def testRandomSelf(self):
-      for idx in range(16):
-        print("\n**** Index resize test ****\n")
+        for idx in range(16):
+            print("\n**** Index resize test ****\n")
 
-        np.random.seed(idx)
-        dim = 16
-        num_elements = 10000
+            np.random.seed(idx)
+            dim = 16
+            num_elements = 10000
 
-        # Generating sample data
-        data = np.float32(np.random.random((num_elements, dim)))
+            # Generating sample data
+            data = np.float32(np.random.random((num_elements, dim)))
 
-        # Declaring index
-        p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
+            # Declaring index
+            p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
 
-        # Initing index
-        # max_elements - the maximum number of elements, should be known beforehand
-        #     (probably will be made optional in the future)
-        #
-        # ef_construction - controls index search speed/build speed tradeoff
-        # M - is tightly connected with internal dimensionality of the data
-        #     stronlgy affects the memory consumption
+            # Initializing index
+            # max_elements - the maximum number of elements, should be known beforehand
+            #     (probably will be made optional in the future)
+            #
+            # ef_construction - controls index search speed/build speed tradeoff
+            # M - is tightly connected with internal dimensionality of the data
+            #     strongly affects the memory consumption
 
-        p.init_index(max_elements=num_elements//2, ef_construction=100, M=16)
+            p.init_index(max_elements=num_elements//2, ef_construction=100, M=16)
 
-        # Controlling the recall by setting ef:
-        # higher ef leads to better accuracy, but slower search
-        p.set_ef(20)
+            # Controlling the recall by setting ef:
+            # higher ef leads to better accuracy, but slower search
+            p.set_ef(20)
 
-        p.set_num_threads(idx%8)  # by default using all available cores
+            p.set_num_threads(idx % 8)  # by default using all available cores
 
-        # We split the data in two batches:
-        data1 = data[:num_elements // 2]
-        data2 = data[num_elements // 2:]
+            # We split the data in two batches:
+            data1 = data[:num_elements // 2]
+            data2 = data[num_elements // 2:]
 
-        print("Adding first batch of %d elements" % (len(data1)))
-        p.add_items(data1)
+            print("Adding first batch of %d elements" % (len(data1)))
+            p.add_items(data1)
 
-        # Query the elements for themselves and measure recall:
-        labels, distances = p.knn_query(data1, k=1)
+            # Query the elements for themselves and measure recall:
+            labels, distances = p.knn_query(data1, k=1)
 
-        items = p.get_items(list(range(len(data1))))
+            items = p.get_items(list(range(len(data1))))
 
-        # Check the recall:
-        self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3)
+            # Check the recall:
+            self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3)
 
-        # Check that the returned element data is correct:
-        diff_with_gt_labels = np.max(np.abs(data1-items))
-        self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)
+            # Check that the returned element data is correct:
+            diff_with_gt_labels = np.max(np.abs(data1-items))
+            self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)
 
-        print("Resizing the index")
-        p.resize_index(num_elements)
+            print("Resizing the index")
+            p.resize_index(num_elements)
 
-        print("Adding the second batch of %d elements" % (len(data2)))
-        p.add_items(data2)
+            print("Adding the second batch of %d elements" % (len(data2)))
+            p.add_items(data2)
 
-        # Query the elements for themselves and measure recall:
-        labels, distances = p.knn_query(data, k=1)
-        items=p.get_items(list(range(num_elements)))
+            # Query the elements for themselves and measure recall:
+            labels, distances = p.knn_query(data, k=1)
+            items=p.get_items(list(range(num_elements)))
 
-        # Check the recall:
-        self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3)
+            # Check the recall:
+            self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3)
 
-        # Check that the returned element data is correct:
-        diff_with_gt_labels=np.max(np.abs(data-items))
-        self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)
+            # Check that the returned element data is correct:
+            diff_with_gt_labels = np.max(np.abs(data-items))
+            self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)
 
-        # Checking that all labels are returned correcly:
-        sorted_labels=sorted(p.get_ids_list())
-        self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0)
+            # Checking that all labels are returned correctly:
+            sorted_labels = sorted(p.get_ids_list())
+            self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0)