42 changes: 39 additions & 3 deletions osbenchmark/synthetic_data_generator/models.py
@@ -101,6 +101,25 @@ class GeneratorParams(BaseModel):
# Text / Keywords Params
must_include: Optional[List[str]] = None
choices: Optional[List[str]] = None
min_words: Optional[int] = None
max_words: Optional[int] = None

# Float / Double Params
precision: Optional[int] = None

# knn_vector Params
dimension: Optional[int] = None
sample_vectors: Optional[List[List[float]]] = None
noise_factor: Optional[float] = None
distribution_type: Optional[str] = None
normalize: Optional[bool] = None

# sparse_vector Params
num_tokens: Optional[int] = None
min_weight: Optional[float] = None
max_weight: Optional[float] = None
token_id_start: Optional[int] = None
token_id_step: Optional[int] = None

class Config:
extra = 'forbid'
@@ -124,9 +143,11 @@ def validate_generator_name(cls, v):
'generate_boolean',
'generate_date',
'generate_ip',
-            'generate_geopoint',
+            'generate_geo_point',
'generate_object',
-            'generate_nested'
+            'generate_nested',
'generate_knn_vector',
'generate_sparse_vector'
]

if v not in valid_generators:
@@ -140,8 +161,23 @@ class MappingGenerationValuesConfig(BaseModel):
# pylint: disable = no-self-argument
@field_validator('generator_overrides')
def validate_generator_types(cls, v):
        # Based on the OpenSearch documentation: https://docs.opensearch.org/latest/mappings/supported-field-types/index/
        # TODO: Add support for more field types
if v is not None:
-            valid_generator_types = ['integer', 'long', 'float', 'double', 'date', 'text', 'keyword', 'short', 'byte', 'ip', 'geopoint', 'nested', 'boolean']
supported_mapping_field_types = {
'core-field-types': ['boolean'],
'string-based-field-types': ['text', 'keyword'],
'numeric-field-types': ['byte', 'short', 'integer', 'long', 'float', 'double'],
'date-time-field-types': ['date'],
'ip-field-types': ['ip'],
'geographic-field-types': ['geo_point'],
'object-field-types': ['object', 'nested'],
'vector-field-types': ['knn_vector', 'sparse_vector']
}
valid_generator_types = []

for field_types in supported_mapping_field_types.values():
valid_generator_types.extend(field_types)

for generator_type in v.keys():
if generator_type not in valid_generator_types:
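For a sense of how the new params surface to users, here is a hypothetical generator_overrides config exercising the knn_vector and sparse_vector fields added to GeneratorParams above. This is a sketch assuming the same mapping_config shape the tests below use; all values are illustrative:

# Hypothetical override config; keys mirror the new GeneratorParams fields,
# structure mirrors mapping_config in strategies_test.py below.
mapping_config = {
    "generator_overrides": {
        "knn_vector": {
            "dimension": 4,                          # small for readability
            "sample_vectors": [[0.1, 0.2, 0.3, 0.4],
                               [0.9, 0.8, 0.7, 0.6]],
            "noise_factor": 0.05,                    # tight clusters
            "distribution_type": "gaussian",
            "normalize": True,                       # pair with cosinesimil
        },
        "sparse_vector": {
            "num_tokens": 10,
            "min_weight": 0.01,
            "max_weight": 1.0,
            "token_id_start": 1000,
            "token_id_step": 100,
        },
    },
    "field_overrides": {},
}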
@@ -121,6 +121,7 @@ def __init__(self, mapping_generation_values=None, seed=1):
random.seed(seed)

# seed these
# TODO: Should apply all of these: https://docs.opensearch.org/latest/mappings/supported-field-types/index/
self.type_generators = {
"text": self.generate_text,
"keyword": self.generate_keyword,
@@ -136,6 +137,8 @@ def __init__(self, mapping_generation_values=None, seed=1):
"object": self.generate_object,
"nested": self.generate_nested,
"geo_point": self.generate_geo_point,
"knn_vector": self.generate_knn_vector,
"sparse_vector": self.generate_sparse_vector,
}

@staticmethod
@@ -258,6 +261,95 @@ def generate_nested(self, field_def: Dict[str, Any], **params) -> list:
# Will be replaced by a list of nested objects
return []

def generate_knn_vector(self, field_def: Dict[str, Any], **params) -> list:
"""
Generate dense vector embeddings for knn_vector field type.
        Supports both random generation and sample-based generation with noise for realistic clustering.

Args:
field_def: Field definition from mapping
**params: Optional parameters:
                dimension: Vector dimensions. Can be retrieved from field_def (default: 128)
sample_vectors: List of base vectors to add noise to. Helps with realistic clustering.
Without sample_vectors, OSB generates uniform random vectors between -1.0 and 1.0
noise_factor: Standard deviation (gaussian) or range (uniform) of noise (default: 0.1)
Lower values (0.01-0.05) create tight clusters.
Higher values (0.2-0.5) create diverse distributions.
distribution_type: Type of noise distribution (default: "gaussian").
"gaussian": Normal distribution, realistic with outliers
"uniform": Bounded distribution, predictable variation
normalize: Whether to normalize the vector after generation (default: False)
Set to True when using cosinesimil space_type in OpenSearch.
Normalized vectors have magnitude = 1.0.

Returns:
List of floats representing the dense vector.
            With sample_vectors, produces realistic variation around the provided sample clusters.
Without sample_vectors, it uses random uniform values between -1.0 and 1.0.
"""

dims = field_def.get("dimension", params.get("dimension", 128))
sample_vectors = params.get("sample_vectors", None)

if sample_vectors:
noise_factor = params.get("noise_factor", 0.1)
distribution_type = params.get("distribution_type", "gaussian")
normalize = params.get("normalize", False)

# Pick random sample vector
base_vector = random.choice(sample_vectors)

# Generate noise based on distribution type
if distribution_type == "gaussian":
noise = [random.gauss(0, noise_factor) for _ in range(dims)]
else: # uniform
noise = [random.uniform(-noise_factor, noise_factor) for _ in range(dims)]

# Add noise to base vector
vector = [base_vector[i] + noise[i] for i in range(dims)]

# Normalize if requested
if normalize:
magnitude = sum(x**2 for x in vector) ** 0.5
if magnitude > 0:
vector = [x / magnitude for x in vector]

return vector

else:
# Fallback to random generation with each dimension being between -1 and 1
return [random.uniform(-1.0, 1.0) for _ in range(dims)]

def generate_sparse_vector(self, field_def: Dict[str, Any], **params) -> Dict[str, float]:
"""
Generate sparse vector as token_id -> weight pairs for sparse_vector field type.

Args:
field_def: Field definition from mapping
            **params: Optional parameters:
num_tokens: Number of token-weight pairs (default: 10)
min_weight: Minimum weight value (default: 0.01)
max_weight: Maximum weight value (default: 1.0)
token_id_start: Starting token ID (default: 1000)
token_id_step: Step between token IDs (default: 100)

Returns:
Dict of token_id -> weight pairs with positive float values
"""
num_tokens = params.get('num_tokens', 10)
min_weight = params.get('min_weight', 0.01)
max_weight = params.get('max_weight', 1.0)
token_id_start = params.get('token_id_start', 1000)
token_id_step = params.get('token_id_step', 100)

sparse_vector = {}
for i in range(num_tokens):
token_id = str(token_id_start + (i * token_id_step))
weight = random.uniform(min_weight, max_weight)
            sparse_vector[token_id] = round(weight, 4)  # round to 4 decimals to imitate real neural sparse models like SPLADE and DeepImpact

return sparse_vector

def transform_mapping_to_generators(self, mapping_dict: Dict[str, Any], field_path_prefix="") -> Dict[str, Callable[[], Any]]:
"""
Transforms an OpenSearch mapping into a dictionary of field names mapped to generator functions.
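To make the sample-vector path in generate_knn_vector concrete, here is a minimal standalone sketch of the same noise-and-normalize logic. It is plain Python outside the converter class; the seed, dims, and sample vectors are local stand-ins, not OSB API:

import random

random.seed(1)

dims = 4
sample_vectors = [[0.1, 0.2, 0.3, 0.4], [0.9, 0.8, 0.7, 0.6]]
noise_factor = 0.05

# Pick a base vector and perturb each dimension with Gaussian noise,
# as generate_knn_vector does for distribution_type="gaussian".
base = random.choice(sample_vectors)
vector = [base[i] + random.gauss(0, noise_factor) for i in range(dims)]

# Normalize to unit magnitude (what normalize=True produces, as needed
# for the cosinesimil space_type).
magnitude = sum(x ** 2 for x in vector) ** 0.5
unit = [x / magnitude for x in vector]

assert abs(sum(x ** 2 for x in unit) - 1.0) < 1e-9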
97 changes: 97 additions & 0 deletions tests/synthetic_data_generator/strategies_test.py
@@ -1285,3 +1285,100 @@ def test_generating_documents_for_with_overrides(self, mapping_converter):
assert field in document

assert document["id"] in ["Helly R", "Mark S", "Irving B"]

def test_generate_sparse_vector(self, mapping_converter):
"""Test basic sparse_vector generation"""
mapping = {
"properties": {
"sparse_embedding": {
"type": "sparse_vector"
}
}
}

generators = mapping_converter.transform_mapping_to_generators(mapping)
document = MappingConverter.generate_synthetic_document(generators)

# Validate field exists
assert "sparse_embedding" in document

# Validate it's a dictionary
assert isinstance(document["sparse_embedding"], dict)

# Validate structure: string keys (token IDs), float values (weights)
for token_id, weight in document["sparse_embedding"].items():
assert isinstance(token_id, str)
assert token_id.isdigit() # Token IDs should be numeric strings
assert isinstance(weight, (int, float))
assert weight > 0 # Weights must be positive

def test_generate_sparse_vector_with_params(self, mapping_converter):
"""Test sparse_vector generation with custom parameters"""
# Override to use custom params
mapping_converter.mapping_config = {
"generator_overrides": {
"sparse_vector": {
"num_tokens": 5,
"min_weight": 0.1,
"max_weight": 0.9,
"token_id_start": 5000,
"token_id_step": 50
}
},
"field_overrides": {}
}

mapping = {
"properties": {
"embedding": {
"type": "sparse_vector"
}
}
}

generators = mapping_converter.transform_mapping_to_generators(mapping)
document = MappingConverter.generate_synthetic_document(generators)

# Should have exactly 5 tokens
assert len(document["embedding"]) == 5

# Validate token ID range: should be 5000, 5050, 5100, 5150, 5200
expected_token_ids = {"5000", "5050", "5100", "5150", "5200"}
assert set(document["embedding"].keys()) == expected_token_ids

# All weights should be in range [0.1, 0.9]
for weight in document["embedding"].values():
assert 0.1 <= weight <= 0.9

def test_generate_sparse_vector_in_complex_mapping(self, mapping_converter):
"""Test sparse_vector within a complex mapping alongside other field types"""
mapping = {
"properties": {
"text": {"type": "text"},
"dense_vector": {"type": "knn_vector", "dimension": 3},
"sparse_vector": {"type": "sparse_vector"},
"metadata": {
"type": "object",
"properties": {
"id": {"type": "keyword"}
}
}
}
}

generators = mapping_converter.transform_mapping_to_generators(mapping)
document = MappingConverter.generate_synthetic_document(generators)

# Validate all fields generated
assert "text" in document
assert "dense_vector" in document
assert "sparse_vector" in document
assert "metadata" in document

# Validate sparse_vector is correct format
assert isinstance(document["sparse_vector"], dict)
assert len(document["sparse_vector"]) > 0

# Validate dense_vector is different from sparse_vector
assert isinstance(document["dense_vector"], list)
assert len(document["dense_vector"]) == 3
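
The expected token IDs asserted in test_generate_sparse_vector_with_params fall directly out of the token-ID arithmetic in generate_sparse_vector. A standalone sketch of that sequence, assuming nothing beyond the stdlib:

import random

random.seed(1)

num_tokens, token_id_start, token_id_step = 5, 5000, 50
min_weight, max_weight = 0.1, 0.9

# Same shape as generate_sparse_vector: string token IDs spaced by
# token_id_step, weights rounded to 4 decimal places.
sparse = {
    str(token_id_start + i * token_id_step):
        round(random.uniform(min_weight, max_weight), 4)
    for i in range(num_tokens)
}

assert set(sparse) == {"5000", "5050", "5100", "5150", "5200"}
assert all(min_weight <= w <= max_weight for w in sparse.values())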