diff --git a/osbenchmark/synthetic_data_generator/models.py b/osbenchmark/synthetic_data_generator/models.py index 2e95104d..8ada83c6 100644 --- a/osbenchmark/synthetic_data_generator/models.py +++ b/osbenchmark/synthetic_data_generator/models.py @@ -101,6 +101,25 @@ class GeneratorParams(BaseModel): # Text / Keywords Params must_include: Optional[List[str]] = None choices: Optional[List[str]] = None + min_words: Optional[int] = None + max_words: Optional[int] = None + + # Float / Double Params + precision: Optional[int] = None + + # knn_vector Params + dimension: Optional[int] = None + sample_vectors: Optional[List[List[float]]] = None + noise_factor: Optional[float] = None + distribution_type: Optional[str] = None + normalize: Optional[bool] = None + + # sparse_vector Params + num_tokens: Optional[int] = None + min_weight: Optional[float] = None + max_weight: Optional[float] = None + token_id_start: Optional[int] = None + token_id_step: Optional[int] = None class Config: extra = 'forbid' @@ -124,9 +143,11 @@ def validate_generator_name(cls, v): 'generate_boolean', 'generate_date', 'generate_ip', - 'generate_geopoint', + 'generate_geo_point', 'generate_object', - 'generate_nested' + 'generate_nested', + 'generate_knn_vector', + 'generate_sparse_vector' ] if v not in valid_generators: @@ -140,8 +161,23 @@ class MappingGenerationValuesConfig(BaseModel): # pylint: disable = no-self-argument @field_validator('generator_overrides') def validate_generator_types(cls, v): + # Based on this documentation from OpenSearch: https://docs.opensearch.org/latest/mappings/supported-field-types/index/ + # TODO: Add more support for if v is not None: - valid_generator_types = ['integer', 'long', 'float', 'double', 'date', 'text', 'keyword', 'short', 'byte', 'ip', 'geopoint', 'nested', 'boolean'] + supported_mapping_field_types = { + 'core-field-types': ['boolean'], + 'string-based-field-types': ['text', 'keyword'], + 'numeric-field-types': ['byte', 'short', 'integer', 'long', 
def generate_knn_vector(self, field_def: Dict[str, Any], **params) -> list:
    """
    Generate a dense vector embedding for the knn_vector field type.

    Supports purely random generation as well as sample-based generation,
    where noise is added to a randomly chosen base vector to produce
    realistic clusters.

    Args:
        field_def: Field definition from the mapping. Its "dimension" entry,
            when present, takes precedence over the "dimension" parameter.
        **params: Optional parameters:
            dimension: Vector dimension, used only when field_def has no
                "dimension" (default: 128).
            sample_vectors: List of base vectors to add noise to. Each base
                vector must have at least `dimension` components; extra
                components are ignored. Without sample_vectors, every
                component is drawn uniformly from [-1.0, 1.0].
            noise_factor: Standard deviation (gaussian) or half-range
                (uniform) of the noise (default: 0.1).
                Lower values (0.01-0.05) create tight clusters.
                Higher values (0.2-0.5) create diverse distributions.
            distribution_type: Noise distribution (default: "gaussian").
                Any value other than "gaussian" selects uniform noise.
            normalize: Whether to scale the generated vector to unit
                magnitude (default: False). Applied to both sample-based and
                purely random vectors. Set to True when using the cosinesimil
                space_type in OpenSearch.

    Returns:
        List of floats of length `dimension`.

    Raises:
        ValueError: If the chosen sample vector has fewer components than
            the requested dimension.
    """
    dims = field_def.get("dimension", params.get("dimension", 128))
    sample_vectors = params.get("sample_vectors", None)
    # Read once up front so normalization applies to every generation path.
    normalize = params.get("normalize", False)

    if sample_vectors:
        noise_factor = params.get("noise_factor", 0.1)
        distribution_type = params.get("distribution_type", "gaussian")

        # Pick a random cluster centre to perturb.
        base_vector = random.choice(sample_vectors)
        if len(base_vector) < dims:
            # Fail early with a readable message instead of an IndexError
            # raised from the middle of the noise-addition step.
            raise ValueError(
                f"sample vector has {len(base_vector)} components "
                f"but the knn_vector dimension is {dims}"
            )

        if distribution_type == "gaussian":
            vector = [base_vector[i] + random.gauss(0, noise_factor) for i in range(dims)]
        else:  # uniform
            vector = [
                base_vector[i] + random.uniform(-noise_factor, noise_factor)
                for i in range(dims)
            ]
    else:
        # No samples supplied: each component is uniform in [-1.0, 1.0].
        vector = [random.uniform(-1.0, 1.0) for _ in range(dims)]

    if normalize:
        magnitude = sum(x ** 2 for x in vector) ** 0.5
        # A zero vector cannot be scaled to unit length; return it unchanged.
        if magnitude > 0:
            vector = [x / magnitude for x in vector]

    return vector


def generate_sparse_vector(self, field_def: Dict[str, Any], **params) -> Dict[str, float]:
    """
    Generate a sparse vector as token_id -> weight pairs for the
    sparse_vector field type.

    Args:
        field_def: Field definition from the mapping (not consulted here).
        **params: Optional parameters:
            num_tokens: Number of token-weight pairs (default: 10)
            min_weight: Lower bound for weights (default: 0.01)
            max_weight: Upper bound for weights (default: 1.0)
            token_id_start: First token ID (default: 1000)
            token_id_step: Increment between consecutive token IDs (default: 100)

    Returns:
        Dict mapping stringified token IDs to weights rounded to 4 decimals.
        With the default bounds every weight is positive.
    """
    num_tokens = params.get('num_tokens', 10)
    min_weight = params.get('min_weight', 0.01)
    max_weight = params.get('max_weight', 1.0)
    token_id_start = params.get('token_id_start', 1000)
    token_id_step = params.get('token_id_step', 100)

    # Weights are rounded to 4 decimals to imitate real neural sparse search
    # models like Splade and DeepImpact.
    return {
        str(token_id_start + (i * token_id_step)): round(random.uniform(min_weight, max_weight), 4)
        for i in range(num_tokens)
    }
def test_generate_sparse_vector(self, mapping_converter):
    """A plain sparse_vector field produces a dict of numeric-string token IDs mapped to positive weights."""
    sparse_mapping = {"properties": {"sparse_embedding": {"type": "sparse_vector"}}}

    field_generators = mapping_converter.transform_mapping_to_generators(sparse_mapping)
    doc = MappingConverter.generate_synthetic_document(field_generators)

    # The field must be present and be a dictionary
    assert "sparse_embedding" in doc
    embedding = doc["sparse_embedding"]
    assert isinstance(embedding, dict)

    # Keys are token IDs rendered as digit-only strings; values are positive numbers
    for key, value in embedding.items():
        assert isinstance(key, str)
        assert key.isdigit()
        assert isinstance(value, (int, float))
        assert value > 0


def test_generate_sparse_vector_with_params(self, mapping_converter):
    """Generator overrides control the token count, token ID sequence and weight bounds."""
    sparse_overrides = {
        "num_tokens": 5,
        "min_weight": 0.1,
        "max_weight": 0.9,
        "token_id_start": 5000,
        "token_id_step": 50,
    }
    mapping_converter.mapping_config = {
        "generator_overrides": {"sparse_vector": sparse_overrides},
        "field_overrides": {},
    }

    sparse_mapping = {"properties": {"embedding": {"type": "sparse_vector"}}}

    field_generators = mapping_converter.transform_mapping_to_generators(sparse_mapping)
    doc = MappingConverter.generate_synthetic_document(field_generators)
    embedding = doc["embedding"]

    # Exactly five token-weight pairs
    assert len(embedding) == 5

    # Token IDs start at 5000 and advance in steps of 50
    assert set(embedding.keys()) == {"5000", "5050", "5100", "5150", "5200"}

    # Every weight lies within [0.1, 0.9]
    for value in embedding.values():
        assert 0.1 <= value <= 0.9


def test_generate_sparse_vector_in_complex_mapping(self, mapping_converter):
    """sparse_vector coexists with text, knn_vector and object fields in one mapping."""
    complex_mapping = {
        "properties": {
            "text": {"type": "text"},
            "dense_vector": {"type": "knn_vector", "dimension": 3},
            "sparse_vector": {"type": "sparse_vector"},
            "metadata": {
                "type": "object",
                "properties": {"id": {"type": "keyword"}},
            },
        }
    }

    field_generators = mapping_converter.transform_mapping_to_generators(complex_mapping)
    doc = MappingConverter.generate_synthetic_document(field_generators)

    # Every top-level field is generated
    assert "text" in doc
    assert "dense_vector" in doc
    assert "sparse_vector" in doc
    assert "metadata" in doc

    # The sparse vector is a non-empty dict
    sparse = doc["sparse_vector"]
    assert isinstance(sparse, dict)
    assert len(sparse) > 0

    # The dense vector is a list with the mapped dimension
    dense = doc["dense_vector"]
    assert isinstance(dense, list)
    assert len(dense) == 3