Skip to content

Commit 66b5311

Browse files
committed
Add more prefix bucket testcases
Signed-off-by: Samuel Monson <[email protected]>
1 parent 5795c02 commit 66b5311

File tree

1 file changed

+218
-2
lines changed

1 file changed

+218
-2
lines changed

tests/unit/dataset/test_synthetic.py

Lines changed: 218 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,76 @@
1818
)
1919

2020

21+
class TestPrefixBucketConfig:
    """Unit tests for constructing and validating PrefixBucketConfig.

    ### WRITTEN BY AI ###
    """

    @pytest.mark.smoke
    def test_creation_with_valid_params(self):
        """A config built from valid arguments exposes them unchanged.

        ### WRITTEN BY AI ###
        """
        bucket = PrefixBucketConfig(bucket_weight=100, prefix_count=1, prefix_tokens=5)

        observed = (bucket.bucket_weight, bucket.prefix_count, bucket.prefix_tokens)
        assert observed == (100, 1, 5)

    @pytest.mark.sanity
    def test_creation_with_negative_values(self):
        """A negative value in any single field is rejected with ValueError.

        ### WRITTEN BY AI ###
        """
        # Each case makes exactly one field negative and keeps the others valid.
        negative_cases = (
            {"bucket_weight": -10, "prefix_count": 1, "prefix_tokens": 5},
            {"bucket_weight": 100, "prefix_count": -1, "prefix_tokens": 5},
            {"bucket_weight": 100, "prefix_count": 1, "prefix_tokens": -5},
        )

        for kwargs in negative_cases:
            with pytest.raises(ValueError):
                PrefixBucketConfig(**kwargs)

    @pytest.mark.regression
    def test_prefix_bucket_zero_weight_error(self):
        """A bucket_weight of zero is rejected with ValueError.

        ### WRITTEN BY AI ###
        """
        zero_weight = {"bucket_weight": 0, "prefix_count": 1, "prefix_tokens": 2}

        with pytest.raises(ValueError):
            PrefixBucketConfig(**zero_weight)

    @pytest.mark.sanity
    def test_prefix_bucket_config_validation(self):
        """Valid values are stored; an out-of-range field raises ValueError.

        ### WRITTEN BY AI ###
        """
        # Happy path: every field is read back exactly as supplied.
        expected = {"bucket_weight": 50, "prefix_count": 2, "prefix_tokens": 3}
        accepted = PrefixBucketConfig(**expected)
        for field_name, value in expected.items():
            assert getattr(accepted, field_name) == value

        # Rejection path, in order: zero bucket_weight, zero prefix_count,
        # negative prefix_tokens.
        rejected = (
            {"bucket_weight": 0, "prefix_count": 1, "prefix_tokens": 2},
            {"bucket_weight": 100, "prefix_count": 0, "prefix_tokens": 2},
            {"bucket_weight": 100, "prefix_count": 1, "prefix_tokens": -1},
        )
        for kwargs in rejected:
            with pytest.raises(ValueError):
                PrefixBucketConfig(**kwargs)
89+
90+
2191
class TestSyntheticDatasetConfig:
2292
"""Test cases for SyntheticDatasetConfig class.
2393
@@ -306,10 +376,11 @@ def mock_integer_range_sampler(self):
306376
### WRITTEN BY AI ###
307377
"""
308378
with patch("guidellm.dataset.synthetic.IntegerRangeSampler") as mock_sampler:
309-
# Default side effect for basic iteration
379+
# Side effect for basic iteration with enough values for larger tests
310380
def mock_sampler_side_effect(*args, **kwargs):
311381
mock_instance = Mock()
312-
mock_instance.__iter__ = Mock(return_value=iter([15, 15, 15, 15, 15]))
382+
# Provide enough values for tests (up to 20 items)
383+
mock_instance.__iter__ = Mock(return_value=iter([15] * 20))
313384
return mock_instance
314385

315386
mock_sampler.side_effect = mock_sampler_side_effect
@@ -346,6 +417,45 @@ def config_with_prefix(self):
346417
source="The quick brown fox jumps over the lazy dog.",
347418
)
348419

420+
@pytest.fixture
def config_with_multiple_prefix_buckets(self):
    """Provide a synthetic dataset config with two weighted prefix buckets.

    The buckets carry weights 60 and 40, one prefix each, with 2 and 4
    prefix tokens respectively; the config requests 10 samples.

    ### WRITTEN BY AI ###
    """
    # (bucket_weight, prefix_tokens) per bucket; prefix_count is 1 for both.
    buckets = [
        PrefixBucketConfig(bucket_weight=weight, prefix_count=1, prefix_tokens=tokens)
        for weight, tokens in ((60, 2), (40, 4))
    ]

    return SyntheticDatasetConfig(
        prefix_buckets=buckets,
        prompt_tokens=10,
        output_tokens=5,
        samples=10,
        source="The quick brown fox jumps over the lazy dog.",
    )
440+
441+
@pytest.fixture
def config_with_multiple_prefix_counts(self):
    """Provide a synthetic dataset config whose only bucket has prefix_count=3.

    The single bucket has weight 100 and 2 prefix tokens; the config
    requests 6 samples.

    ### WRITTEN BY AI ###
    """
    return SyntheticDatasetConfig(
        prefix_buckets=[
            PrefixBucketConfig(bucket_weight=100, prefix_count=3, prefix_tokens=2)
        ],
        prompt_tokens=8,
        output_tokens=4,
        samples=6,
        source="The quick brown fox jumps over the lazy dog.",
    )
458+
349459
@pytest.fixture
350460
def complex_config(self):
351461
"""Fixture for complex configuration with variance.
@@ -552,6 +662,112 @@ def test_unique_prefix_generation(self, simple_config, mock_tokenizer):
552662
# Verify cycle was called with vocab values
553663
mock_cycle.assert_called_once()
554664

665+
@pytest.mark.regression
def test_multiple_prefix_buckets_distribution(
    self,
    mock_integer_range_sampler,
    config_with_multiple_prefix_buckets,
    mock_tokenizer,
):
    """Check that samples are split across prefix buckets according to weight.

    ### WRITTEN BY AI ###
    """
    config = config_with_multiple_prefix_buckets
    items = list(SyntheticTextItemsGenerator(config, mock_tokenizer, random_seed=42))

    # One generated item per requested sample.
    assert len(items) == config.samples

    # The mocked sampler yields 15 prompt tokens, so with prefixes of 2 and 4
    # tokens every item must total either 17 or 19.
    token_totals = [item["prompt_tokens_count"] for item in items]
    assert all(total in [17, 19] for total in token_totals)

    # Weights 60/40 over 10 samples -> 6 items from the 2-token bucket (17)
    # and 4 items from the 4-token bucket (19).
    short_prefix_items = sum(1 for total in token_totals if total == 17)
    long_prefix_items = sum(1 for total in token_totals if total == 19)
    assert short_prefix_items == 6
    assert long_prefix_items == 4
701+
702+
@pytest.mark.regression
def test_multiple_prefix_counts(
    self,
    mock_integer_range_sampler,
    config_with_multiple_prefix_counts,
    mock_tokenizer,
):
    """Check token totals when a single bucket has prefix_count > 1.

    ### WRITTEN BY AI ###
    """
    config = config_with_multiple_prefix_counts
    items = list(SyntheticTextItemsGenerator(config, mock_tokenizer, random_seed=42))

    # One generated item per requested sample.
    assert len(items) == config.samples

    # Every item carries 2 prefix tokens on top of the 15 prompt tokens the
    # mocked sampler yields, i.e. 17 in total.
    token_totals = [item["prompt_tokens_count"] for item in items]
    assert token_totals == [17] * len(items)
725+
726+
@pytest.mark.sanity
def test_prefix_buckets_create_prefixes_method(
    self, config_with_multiple_prefix_buckets, mock_tokenizer
):
    """Call _create_prefixes directly and check the shape of its result.

    ### WRITTEN BY AI ###
    """
    generator = SyntheticTextItemsGenerator(
        config_with_multiple_prefix_buckets, mock_tokenizer, random_seed=42
    )

    # Stub random source whose randint always answers 0.
    stub_rand = Mock()
    stub_rand.randint.return_value = 0

    prefixes = generator._create_prefixes(stub_rand)

    # The result is a list holding 10 prefixes.
    assert isinstance(prefixes, list)
    assert len(prefixes) == 10

    # Every prefix is itself a list made up solely of integer token ids.
    for token_ids in prefixes:
        assert isinstance(token_ids, list)
        assert all(isinstance(token_id, int) for token_id in token_ids)
751+
752+
@pytest.mark.regression
def test_empty_prefix_buckets(
    self, mock_integer_range_sampler, simple_config, mock_tokenizer
):
    """Check that no prefix tokens are added when no prefix buckets are set.

    ### WRITTEN BY AI ###
    """
    # simple_config is expected to carry prefix_buckets=None (fixture is
    # defined elsewhere in the test class).
    items = list(
        SyntheticTextItemsGenerator(simple_config, mock_tokenizer, random_seed=42)
    )

    # Without a prefix, each item has exactly the 15 prompt tokens the mocked
    # sampler yields.
    token_totals = [item["prompt_tokens_count"] for item in items]
    assert token_totals == [15] * len(items)
770+
555771

556772
class TestSyntheticDatasetCreator:
557773
"""Test cases for SyntheticDatasetCreator class.

0 commit comments

Comments
 (0)