Skip to content

Commit 42ea8ab

Browse files
committed
Add support for dense and sparse vectors in synthetic data generation.
Signed-off-by: Ian Hoang <[email protected]>
1 parent 7512238 commit 42ea8ab

File tree

3 files changed

+228
-3
lines changed

3 files changed

+228
-3
lines changed

osbenchmark/synthetic_data_generator/models.py

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,25 @@ class GeneratorParams(BaseModel):
101101
# Text / Keywords Params
102102
must_include: Optional[List[str]] = None
103103
choices: Optional[List[str]] = None
104+
min_words: Optional[int] = None
105+
max_words: Optional[int] = None
106+
107+
# Float / Double Params
108+
precision: Optional[int] = None
109+
110+
# knn_vector Params
111+
dimension: Optional[int] = None
112+
sample_vectors: Optional[List[List[float]]] = None
113+
noise_factor: Optional[float] = None
114+
distribution_type: Optional[str] = None
115+
normalize: Optional[bool] = None
116+
117+
# sparse_vector Params
118+
num_tokens: Optional[int] = None
119+
min_weight: Optional[float] = None
120+
max_weight: Optional[float] = None
121+
token_id_start: Optional[int] = None
122+
token_id_step: Optional[int] = None
104123

105124
class Config:
106125
extra = 'forbid'
@@ -124,9 +143,11 @@ def validate_generator_name(cls, v):
124143
'generate_boolean',
125144
'generate_date',
126145
'generate_ip',
127-
'generate_geopoint',
146+
'generate_geo_point',
128147
'generate_object',
129-
'generate_nested'
148+
'generate_nested',
149+
'generate_knn_vector',
150+
'generate_sparse_vector'
130151
]
131152

132153
if v not in valid_generators:
@@ -140,8 +161,23 @@ class MappingGenerationValuesConfig(BaseModel):
140161
# pylint: disable = no-self-argument
141162
@field_validator('generator_overrides')
142163
def validate_generator_types(cls, v):
164+
# Based on this documentation from OpenSearch: https://docs.opensearch.org/latest/mappings/supported-field-types/index/
165+
# TODO: Add support for additional field types from the documentation above
143166
if v is not None:
144-
valid_generator_types = ['integer', 'long', 'float', 'double', 'date', 'text', 'keyword', 'short', 'byte', 'ip', 'geopoint', 'nested', 'boolean']
167+
supported_mapping_field_types = {
168+
'core-field-types': ['boolean'],
169+
'string-based-field-types': ['text', 'keyword'],
170+
'numeric-field-types': ['byte', 'short', 'integer', 'long', 'float', 'double'],
171+
'date-time-field-types': ['date'],
172+
'ip-field-types': ['ip'],
173+
'geographic-field-types': ['geo_point'],
174+
'object-field-types': ['object', 'nested'],
175+
'vector-field-types': ['knn_vector', 'sparse_vector']
176+
}
177+
valid_generator_types = []
178+
179+
for field_types in supported_mapping_field_types.values():
180+
valid_generator_types.extend(field_types)
145181

146182
for generator_type in v.keys():
147183
if generator_type not in valid_generator_types:

osbenchmark/synthetic_data_generator/strategies/mapping_strategy.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ def __init__(self, mapping_generation_values=None, seed=1):
121121
random.seed(seed)
122122

123123
# seed these
124+
# TODO: Support all of the field types listed here: https://docs.opensearch.org/latest/mappings/supported-field-types/index/
124125
self.type_generators = {
125126
"text": self.generate_text,
126127
"keyword": self.generate_keyword,
@@ -136,6 +137,8 @@ def __init__(self, mapping_generation_values=None, seed=1):
136137
"object": self.generate_object,
137138
"nested": self.generate_nested,
138139
"geo_point": self.generate_geo_point,
140+
"knn_vector": self.generate_knn_vector,
141+
"sparse_vector": self.generate_sparse_vector,
139142
}
140143

141144
@staticmethod
@@ -258,6 +261,95 @@ def generate_nested(self, field_def: Dict[str, Any], **params) -> list:
258261
# Will be replaced by a list of nested objects
259262
return []
260263

264+
def generate_knn_vector(self, field_def: Dict[str, Any], **params) -> list:
    """
    Generate dense vector embeddings for the knn_vector field type.

    Supports both random generation and sample-based generation with noise
    for realistic clustering.

    Args:
        field_def: Field definition from mapping
        **params: Optional parameters:
            dimension: Vector dimensions. Can be retrieved from field_def (default: 128)
            sample_vectors: List of base vectors to add noise to. Helps with realistic clustering.
                Without sample_vectors, OSB generates uniform random vectors between -1.0 and 1.0
            noise_factor: Standard deviation (gaussian) or range (uniform) of noise (default: 0.1)
                Lower values (0.01-0.05) create tight clusters.
                Higher values (0.2-0.5) create diverse distributions.
            distribution_type: Type of noise distribution (default: "gaussian").
                "gaussian": Normal distribution, realistic with outliers
                Any other value falls back to uniform: bounded, predictable variation
            normalize: Whether to normalize the vector after generation (default: False)
                Set to True when using cosinesimil space_type in OpenSearch.
                Normalized vectors have magnitude = 1.0.

    Returns:
        List of floats representing the dense vector.
        When using sample_vectors, creates a realistic variation around the sampled clusters provided.
        Without sample_vectors, it uses random uniform values between -1.0 and 1.0.

    Raises:
        ValueError: If the chosen sample vector has fewer components than the requested dimension.
    """
    # The mapping-level "dimension" wins over the generator override so that
    # generated documents always match the index mapping.
    dims = field_def.get("dimension", params.get("dimension", 128))
    sample_vectors = params.get("sample_vectors", None)

    if sample_vectors:
        noise_factor = params.get("noise_factor", 0.1)
        distribution_type = params.get("distribution_type", "gaussian")
        normalize = params.get("normalize", False)

        # Pick a random sample vector to perturb
        base_vector = random.choice(sample_vectors)

        # Fail fast with a clear message instead of an opaque IndexError in the
        # component-wise add below. (A longer base vector is allowed; only its
        # first `dims` components are used.)
        if len(base_vector) < dims:
            raise ValueError(
                f"sample vector has {len(base_vector)} components but dimension is {dims}"
            )

        # Generate noise based on distribution type
        if distribution_type == "gaussian":
            noise = [random.gauss(0, noise_factor) for _ in range(dims)]
        else:  # uniform
            noise = [random.uniform(-noise_factor, noise_factor) for _ in range(dims)]

        # Add noise to the base vector, component-wise
        vector = [base_vector[i] + noise[i] for i in range(dims)]

        # Normalize to unit magnitude if requested (needed for cosinesimil)
        if normalize:
            magnitude = sum(x**2 for x in vector) ** 0.5
            if magnitude > 0:
                vector = [x / magnitude for x in vector]

        return vector

    else:
        # Fallback to random generation with each dimension being between -1 and 1
        return [random.uniform(-1.0, 1.0) for _ in range(dims)]
def generate_sparse_vector(self, field_def: Dict[str, Any], **params) -> Dict[str, float]:
    """
    Generate a sparse vector as token_id -> weight pairs for the sparse_vector field type.

    Args:
        field_def: Field definition from mapping
        **params: The following are optional parameters:
            num_tokens: Number of token-weight pairs (default: 10)
            min_weight: Minimum weight value (default: 0.01)
            max_weight: Maximum weight value (default: 1.0)
            token_id_start: Starting token ID (default: 1000)
            token_id_step: Step between token IDs (default: 100)

    Returns:
        Dict of token_id -> weight pairs with positive float values
    """
    count = params.get('num_tokens', 10)
    low = params.get('min_weight', 0.01)
    high = params.get('max_weight', 1.0)
    start = params.get('token_id_start', 1000)
    step = params.get('token_id_step', 100)

    # Weights are rounded to 4 decimal places to imitate real neural sparse
    # search models such as SPLADE and DeepImpact.
    return {
        str(start + idx * step): round(random.uniform(low, high), 4)
        for idx in range(count)
    }
261353
def transform_mapping_to_generators(self, mapping_dict: Dict[str, Any], field_path_prefix="") -> Dict[str, Callable[[], Any]]:
262354
"""
263355
Transforms an OpenSearch mapping into a dictionary of field names mapped to generator functions.

tests/synthetic_data_generator/strategies_test.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1285,3 +1285,100 @@ def test_generating_documents_for_with_overrides(self, mapping_converter):
12851285
assert field in document
12861286

12871287
assert document["id"] in ["Helly R", "Mark S", "Irving B"]
1288+
1289+
def test_generate_sparse_vector(self, mapping_converter):
    """Test basic sparse_vector generation"""
    index_mapping = {
        "properties": {
            "sparse_embedding": {"type": "sparse_vector"}
        }
    }

    field_generators = mapping_converter.transform_mapping_to_generators(index_mapping)
    doc = MappingConverter.generate_synthetic_document(field_generators)

    # Field must be present and rendered as a dict of token-id -> weight pairs
    assert "sparse_embedding" in doc
    assert isinstance(doc["sparse_embedding"], dict)

    # Keys are numeric-string token IDs; weights are strictly positive numbers
    for token, value in doc["sparse_embedding"].items():
        assert isinstance(token, str)
        assert token.isdigit()
        assert isinstance(value, (int, float))
        assert value > 0
def test_generate_sparse_vector_with_params(self, mapping_converter):
    """Test sparse_vector generation with custom parameters"""
    # Install custom generator overrides before transforming the mapping
    mapping_converter.mapping_config = {
        "generator_overrides": {
            "sparse_vector": {
                "num_tokens": 5,
                "min_weight": 0.1,
                "max_weight": 0.9,
                "token_id_start": 5000,
                "token_id_step": 50
            }
        },
        "field_overrides": {}
    }

    index_mapping = {
        "properties": {
            "embedding": {"type": "sparse_vector"}
        }
    }

    field_generators = mapping_converter.transform_mapping_to_generators(index_mapping)
    doc = MappingConverter.generate_synthetic_document(field_generators)

    # Exactly num_tokens entries, with token IDs 5000, 5050, 5100, 5150, 5200
    assert len(doc["embedding"]) == 5
    assert set(doc["embedding"].keys()) == {"5000", "5050", "5100", "5150", "5200"}

    # Every weight honors the [min_weight, max_weight] bounds
    for value in doc["embedding"].values():
        assert 0.1 <= value <= 0.9
def test_generate_sparse_vector_in_complex_mapping(self, mapping_converter):
    """Test sparse_vector within a complex mapping alongside other field types"""
    index_mapping = {
        "properties": {
            "text": {"type": "text"},
            "dense_vector": {"type": "knn_vector", "dimension": 3},
            "sparse_vector": {"type": "sparse_vector"},
            "metadata": {
                "type": "object",
                "properties": {
                    "id": {"type": "keyword"}
                }
            }
        }
    }

    field_generators = mapping_converter.transform_mapping_to_generators(index_mapping)
    doc = MappingConverter.generate_synthetic_document(field_generators)

    # Every mapped field should appear in the generated document
    for field_name in ("text", "dense_vector", "sparse_vector", "metadata"):
        assert field_name in doc

    # sparse_vector renders as a non-empty dict of token -> weight pairs
    assert isinstance(doc["sparse_vector"], dict)
    assert len(doc["sparse_vector"]) > 0

    # dense_vector renders as a list matching the mapping's dimension
    assert isinstance(doc["dense_vector"], list)
    assert len(doc["dense_vector"]) == 3

0 commit comments

Comments
 (0)