@@ -121,6 +121,7 @@ def __init__(self, mapping_generation_values=None, seed=1):
         random.seed(seed)
 
         # seed these
+        # TODO: Should apply all of these: https://docs.opensearch.org/latest/mappings/supported-field-types/index/
         self.type_generators = {
             "text": self.generate_text,
             "keyword": self.generate_keyword,
@@ -136,6 +137,8 @@ def __init__(self, mapping_generation_values=None, seed=1):
             "object": self.generate_object,
             "nested": self.generate_nested,
             "geo_point": self.generate_geo_point,
+            "knn_vector": self.generate_knn_vector,
+            "sparse_vector": self.generate_sparse_vector,
         }
 
     @staticmethod
@@ -258,6 +261,95 @@ def generate_nested(self, field_def: Dict[str, Any], **params) -> list:
         # Will be replaced by a list of nested objects
         return []
 
+    def generate_knn_vector(self, field_def: Dict[str, Any], **params) -> list:
+        """
+        Generate dense vector embeddings for the knn_vector field type.
+        Supports both random generation and sample-based generation with noise for realistic clustering.
+
+        Args:
+            field_def: Field definition from mapping
+            **params: Optional parameters:
+                dimension: Number of vector dimensions; a "dimension" set in field_def takes precedence (default: 128)
+                sample_vectors: List of base vectors to add noise to. Helps with realistic clustering.
+                    Without sample_vectors, OSB generates uniform random vectors between -1.0 and 1.0.
+                noise_factor: Standard deviation (gaussian) or range (uniform) of the noise (default: 0.1).
+                    Lower values (0.01-0.05) create tight clusters.
+                    Higher values (0.2-0.5) create diverse distributions.
+                distribution_type: Type of noise distribution (default: "gaussian").
+                    "gaussian": normal distribution, realistic with outliers
+                    "uniform": bounded distribution, predictable variation
+                normalize: Whether to normalize the vector after generation (default: False).
+                    Set to True when using the cosinesimil space_type in OpenSearch.
+                    Normalized vectors have magnitude = 1.0.
+
+        Returns:
+            List of floats representing the dense vector.
+            With sample_vectors, values vary realistically around the provided sample clusters.
+            Without sample_vectors, values are drawn uniformly at random from -1.0 to 1.0.
+        """
+
+        dims = field_def.get("dimension", params.get("dimension", 128))
+        sample_vectors = params.get("sample_vectors", None)
+
+        if sample_vectors:
+            noise_factor = params.get("noise_factor", 0.1)
+            distribution_type = params.get("distribution_type", "gaussian")
+            normalize = params.get("normalize", False)
+
+            # Pick a random sample vector to use as the base
+            base_vector = random.choice(sample_vectors)
+
+            # Generate noise based on distribution type
+            if distribution_type == "gaussian":
+                noise = [random.gauss(0, noise_factor) for _ in range(dims)]
+            else:  # uniform
+                noise = [random.uniform(-noise_factor, noise_factor) for _ in range(dims)]
+
+            # Add noise to the base vector
+            vector = [base_vector[i] + noise[i] for i in range(dims)]
+
+            # Normalize if requested
+            if normalize:
+                magnitude = sum(x ** 2 for x in vector) ** 0.5
+                if magnitude > 0:
+                    vector = [x / magnitude for x in vector]
+
+            return vector
+
+        else:
+            # Fall back to random generation with each dimension between -1.0 and 1.0
+            return [random.uniform(-1.0, 1.0) for _ in range(dims)]
+
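
For reference, a minimal usage sketch of the parameters documented above (illustrative only, not part of this change; the `gen` instance name and the sample vectors are assumptions):

    # assumes `gen` is an instance of the generator class in this file
    field_def = {"type": "knn_vector", "dimension": 4}

    # purely random vector: each dimension drawn uniformly from [-1.0, 1.0]
    random_vec = gen.generate_knn_vector(field_def)

    # clustered vector: gaussian noise around one of two base vectors,
    # normalized for use with the cosinesimil space_type
    clustered_vec = gen.generate_knn_vector(
        field_def,
        sample_vectors=[[0.1, 0.2, 0.3, 0.4], [0.9, 0.8, 0.7, 0.6]],
        noise_factor=0.05,
        distribution_type="gaussian",
        normalize=True,
    )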
+    def generate_sparse_vector(self, field_def: Dict[str, Any], **params) -> Dict[str, float]:
+        """
+        Generate a sparse vector as token_id -> weight pairs for the sparse_vector field type.
+
+        Args:
+            field_def: Field definition from mapping
+            **params: Optional parameters:
+                num_tokens: Number of token-weight pairs (default: 10)
+                min_weight: Minimum weight value (default: 0.01)
+                max_weight: Maximum weight value (default: 1.0)
+                token_id_start: Starting token ID (default: 1000)
+                token_id_step: Step between token IDs (default: 100)
+
+        Returns:
+            Dict of token_id -> weight pairs with positive float values
+        """
+        num_tokens = params.get('num_tokens', 10)
+        min_weight = params.get('min_weight', 0.01)
+        max_weight = params.get('max_weight', 1.0)
+        token_id_start = params.get('token_id_start', 1000)
+        token_id_step = params.get('token_id_step', 100)
+
+        sparse_vector = {}
+        for i in range(num_tokens):
+            token_id = str(token_id_start + (i * token_id_step))
+            weight = random.uniform(min_weight, max_weight)
+            sparse_vector[token_id] = round(weight, 4)  # rounded weights imitate real neural sparse models such as SPLADE and DeepImpact
+
+        return sparse_vector
+
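
For reference, a minimal usage sketch (illustrative only, not part of this change; `gen` is an assumed instance name and the commented output shows only the shape of the result, not real values):

    # three token-weight pairs, token IDs starting at 1000 in steps of 100
    sv = gen.generate_sparse_vector({"type": "sparse_vector"}, num_tokens=3)
    # e.g. {"1000": 0.4213, "1100": 0.0871, "1200": 0.9034}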
     def transform_mapping_to_generators(self, mapping_dict: Dict[str, Any], field_path_prefix="") -> Dict[str, Callable[[], Any]]:
         """
         Transforms an OpenSearch mapping into a dictionary of field names mapped to generator functions.