Skip to content

Commit f5734f9

Browse files
authored
Fix training_data bug in test_ingest_with_training_source_uri_tdb and validate training data dimensions in ingest() (#175)
1 parent bb20254 commit f5734f9

File tree

3 files changed

+68
-9
lines changed

3 files changed

+68
-9
lines changed

apis/python/src/tiledb/vector_search/ingestion.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,13 @@ def ingest(
157157
raise ValueError("training_source_type should not be provided without training_source_uri")
158158

159159
if training_sample_size < -1:
160-
raise ValueError("training_sample_size should either be positive or -1 to auto-configure based on the dataset sizes")
160+
raise ValueError("training_sample_size should either be positive or -1 (to auto-configure based on the dataset sizes)")
161+
162+
if index_type != "IVF_FLAT" and training_sample_size != -1:
163+
raise ValueError("training_sample_size should only be provided with index_type IVF_FLAT")
164+
for variable in ["copy_centroids_uri", "training_input_vectors", "training_source_uri", "training_source_type"]:
165+
if index_type != "IVF_FLAT" and locals().get(variable) is not None:
166+
raise ValueError(f"{variable} should only be provided with index_type IVF_FLAT")
161167

162168
# use index_group_uri for internal clarity
163169
index_group_uri = index_uri
@@ -820,12 +826,13 @@ def centralised_kmeans(
820826
if training_source_type is None:
821827
training_source_type = autodetect_source_type(source_uri=training_source_uri)
822828
training_in_size, training_dimensions, training_vector_type = read_source_metadata(source_uri=training_source_uri, source_type=training_source_type)
823-
dimensions = training_dimensions
829+
if dimensions != training_dimensions:
830+
raise ValueError(f"When training centroids, the index data dimensions ({dimensions}) != the training data dimensions ({training_dimensions})")
824831
sample_vectors = read_input_vectors(
825832
source_uri=training_source_uri,
826833
source_type=training_source_type,
827834
vector_type=training_vector_type,
828-
dimensions=training_dimensions,
835+
dimensions=dimensions,
829836
start_pos=0,
830837
end_pos=training_in_size,
831838
config=config,
@@ -2076,4 +2083,4 @@ def consolidate_and_vacuum(
20762083
elif index_type == "IVF_FLAT":
20772084
return ivf_flat_index.IVFFlatIndex(
20782085
uri=index_group_uri, memory_budget=1000000, config=config
2079-
)
2086+
)

apis/python/test/common.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -295,8 +295,8 @@ def check_equals(result_d, result_i, expected_result_d, expected_result_i):
295295
result_i_expected: int
296296
The expected indices
297297
"""
298-
assert result_i == expected_result_i
299-
assert result_d == expected_result_d
298+
assert result_i == expected_result_i, f"result_i: {result_i} != expected_result_i: {expected_result_i}"
299+
assert result_d == expected_result_d, f"result_d: {result_d} != expected_result_d: {expected_result_d}"
300300

301301
# Generate random names for test array uris
302302
def random_name(name: str) -> str:

apis/python/test/test_ingestion.py

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -886,15 +886,43 @@ def test_ingest_with_training_source_uri_f32(tmp_path):
886886
)
887887

888888
def test_ingest_with_training_source_uri_tdb(tmp_path):
889+
################################################################################################
890+
# First set up the data.
891+
################################################################################################
889892
dataset_dir = os.path.join(tmp_path, "dataset")
890893
os.mkdir(dataset_dir)
891894
# data.shape should give you (cols, rows). So we transpose this before using it.
892-
data = np.array([[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3], [3.0, 3.1, 3.2, 3.3], [4.0, 4.1, 4.2, 4.3], [5.0, 5.1, 5.2, 5.3]], dtype=np.float32).transpose()
895+
data = np.array([
896+
[1.0, 1.1, 1.2, 1.3],
897+
[2.0, 2.1, 2.2, 2.3],
898+
[3.0, 3.1, 3.2, 3.3],
899+
[4.0, 4.1, 4.2, 4.3],
900+
[5.0, 5.1, 5.2, 5.3]], dtype=np.float32).transpose()
893901
create_array(path=os.path.join(dataset_dir, "data.tdb"), data=data)
894902

895-
training_data = data[1:3]
903+
training_data = np.array([
904+
[1.0, 1.1, 1.2, 1.3],
905+
[5.0, 5.1, 5.2, 5.3]], dtype=np.float32).transpose()
896906
create_array(path=os.path.join(dataset_dir, "training_data.tdb"), data=training_data)
897907

908+
# Run a quick test that if we set up training_data incorrectly, we will raise an exception.
909+
with pytest.raises(ValueError) as error:
910+
training_data_invalid = np.array([
911+
[1.0, 1.1, 1.2],
912+
[5.0, 5.1, 5.2]], dtype=np.float32).transpose()
913+
create_array(path=os.path.join(dataset_dir, "training_data_invalid.tdb"), data=training_data_invalid)
914+
index = ingest(
915+
index_type="IVF_FLAT",
916+
index_uri=os.path.join(tmp_path, f"array_invalid"),
917+
source_uri=os.path.join(dataset_dir, "data.tdb"),
918+
training_source_uri=os.path.join(dataset_dir, "training_data_invalid.tdb")
919+
)
920+
assert "training data dimensions" in str(error.value)
921+
922+
################################################################################################
923+
# Test we can ingest, query, update, and consolidate with a training_source_uri.
924+
################################################################################################
925+
print('[test_ingestion@test_ingest_with_training_source_uri_tdb] ingest() ======================================')
898926
index_uri = os.path.join(tmp_path, "array")
899927
index = ingest(
900928
index_type="IVF_FLAT",
@@ -922,9 +950,33 @@ def test_ingest_with_training_source_uri_tdb(tmp_path):
922950
)
923951

924952
def test_ingest_with_training_source_uri_numpy(tmp_path):
925-
data = np.array([[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3], [3.0, 3.1, 3.2, 3.3], [4.0, 4.1, 4.2, 4.3], [5.0, 5.1, 5.2, 5.3]], dtype=np.float32)
953+
################################################################################################
954+
# First set up the data.
955+
################################################################################################
956+
data = np.array([
957+
[1.0, 1.1, 1.2, 1.3],
958+
[2.0, 2.1, 2.2, 2.3],
959+
[3.0, 3.1, 3.2, 3.3],
960+
[4.0, 4.1, 4.2, 4.3],
961+
[5.0, 5.1, 5.2, 5.3]], dtype=np.float32)
926962
training_data = data[1:3]
927963

964+
# Run a quick test that if we set up training_data incorrectly, we will raise an exception.
965+
with pytest.raises(ValueError) as error:
966+
training_data_invalid = np.array([
967+
[4.0, 4.1, 4.2],
968+
[5.0, 5.1, 5.2]], dtype=np.float32)
969+
index = ingest(
970+
index_type="IVF_FLAT",
971+
index_uri=os.path.join(tmp_path, "array_invalid"),
972+
input_vectors=data,
973+
training_input_vectors=training_data_invalid,
974+
)
975+
assert "training data dimensions" in str(error.value)
976+
977+
################################################################################################
978+
# Test we can ingest, query, update, and consolidate.
979+
################################################################################################
928980
index_uri = os.path.join(tmp_path, "array")
929981
index = ingest(
930982
index_type="IVF_FLAT",

0 commit comments

Comments (0)