@@ -76,18 +76,21 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle():
     seed.
-    Keep max_tokens and max_model_len bounded for speed and memory use.
     """
-    random.seed(12345)
+    seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
+    random.seed(seed)

     # Allow overrides from environment (useful for CI tuning)
     # "facebook/opt-125m" is too small, doesn't reliably test determinism
     model = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
     num_trials = int(os.getenv("VLLM_NEEDLE_TRIALS", "5"))
-    batch_size = int(os.getenv("VLLM_NEEDLE_BATCH_SIZE", "64"))
-    assert batch_size >= 2, "Batch size should be >= 2 to mix needle."
+    max_batch_size = int(os.getenv("VLLM_NEEDLE_BATCH_SIZE", "128"))
+    min_random_prompt = int(os.getenv("VLLM_MIN_PROMPT", "1024"))
+    max_random_prompt = int(os.getenv("VLLM_MAX_PROMPT", "2048"))
+    assert max_batch_size >= 2, "Batch size should be >= 2 to mix needle."

     # Keep GPU memory usage low to avoid startup allocation failures.
-    gpu_mem_util = float(os.getenv("VLLM_GPU_MEMORY_UTILIZATION", "0.3"))
-    max_model_len = int(os.getenv("VLLM_MAX_MODEL_LEN", "4096"))
+    gpu_mem_util = float(os.getenv("VLLM_GPU_MEMORY_UTILIZATION", "0.4"))
+    max_model_len = int(os.getenv("VLLM_MAX_MODEL_LEN", "5120"))
     swap_space_gb = int(os.getenv("VLLM_SWAP_SPACE_GB", "4"))

     # Sampling parameters: longer outputs with a more random-sounding
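All of the new knobs are read through os.getenv, so a CI job or a quick local run can resize the test without editing the source. A minimal sketch of a local invocation; the values and test path below are illustrative assumptions, not part of this diff:

# Illustrative local run of the needle test with the new knobs dialed down.
# The env-var names come from the diff; the values and test path are made up.
import os
import pytest

os.environ["VLLM_TEST_SEED"] = "7"
os.environ["VLLM_NEEDLE_TRIALS"] = "2"
os.environ["VLLM_NEEDLE_BATCH_SIZE"] = "16"
os.environ["VLLM_MIN_PROMPT"] = "128"
os.environ["VLLM_MAX_PROMPT"] = "256"
os.environ["VLLM_GPU_MEMORY_UTILIZATION"] = "0.4"

# Run only the needle determinism test; adjust the path to your checkout.
raise SystemExit(pytest.main(["-s", "-k", "needle", "tests/"]))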
@@ -111,7 +114,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle():
     # Engine with bs=1 behavior
     llm_bs1 = LLM_with_max_seqs(
         model=model,
-        max_num_seqs=1,
+        max_num_seqs=128,
         gpu_memory_utilization=gpu_mem_util,
         max_model_len=max_model_len,
         swap_space=swap_space_gb,
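LLM_with_max_seqs is a helper defined elsewhere in this test module and is not shown in the hunk. A plausible sketch of what it wraps, assuming it simply forwards these arguments to vllm.LLM (the real helper may set additional engine options):

# Hypothetical sketch of the LLM_with_max_seqs helper used above; the actual
# helper lives elsewhere in this file and may configure more engine options.
from vllm import LLM


def LLM_with_max_seqs(model, max_num_seqs, gpu_memory_utilization,
                      max_model_len, swap_space):
    # Cap the scheduler at `max_num_seqs` concurrent sequences so the same
    # prompts can be replayed under different batching limits.
    return LLM(
        model=model,
        max_num_seqs=max_num_seqs,
        gpu_memory_utilization=gpu_memory_utilization,
        max_model_len=max_model_len,
        swap_space=swap_space,
    )

Note that after this change both engines are built with max_num_seqs=128, so the bs=1 baseline is presumably enforced by generating the needle prompt on its own rather than by the scheduler cap.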
@@ -126,7 +129,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle():
         # Engine with larger batch limit (e.g., 64)
         llm_bsN = LLM_with_max_seqs(
             model=model,
-            max_num_seqs=batch_size,
+            max_num_seqs=128,
             gpu_memory_utilization=gpu_mem_util,
             max_model_len=max_model_len,
             swap_space=swap_space_gb,
@@ -135,15 +138,17 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle():
         mismatches = 0

         for trial in range(num_trials):
-            # Create a batch of size `batch_size` and insert the needle at
+            # Create a batch of size `max_batch_size` and insert the needle at
             # a random index
             prompts: list[str] = []
+            batch_size = random.randint(max_batch_size // 2, max_batch_size)
             needle_pos = random.randint(0, batch_size - 1)
             for i in range(batch_size):
                 if i == needle_pos:
                     prompts.append(needle_prompt)
                 else:
-                    prompts.append(_random_prompt())
+                    prompts.append(
+                        _random_prompt(min_random_prompt, max_random_prompt))

             # Generate with the larger-batch engine
             outputs = llm_bsN.generate(prompts, sampling)
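The filler prompts come from _random_prompt, which now takes explicit minimum and maximum lengths; the helper itself is outside this diff. A rough sketch of its assumed shape (the real implementation may choose words and phrasing differently):

# Hypothetical sketch of _random_prompt(min_tokens, max_tokens); the real
# helper is defined earlier in this file and may differ in its details.
import random

_FILLER_WORDS = ["alpha", "bravo", "charlie", "delta", "echo", "foxtrot"]


def _random_prompt(min_tokens: int = 10, max_tokens: int = 64) -> str:
    # Vary the prompt length so batch entries land in different-length
    # sequence groups, which is exactly what batch invariance must tolerate.
    n = random.randint(min_tokens, max_tokens)
    body = " ".join(random.choice(_FILLER_WORDS) for _ in range(n))
    return f"Write a short story about {body}."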
@@ -154,17 +159,19 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle():
             text = needle_output.outputs[0].text

             if text != baseline_text:
+                print(
+                    f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n")
                 mismatches += 1

         passes = num_trials - mismatches
         # Dump how many passed vs failed
         print(f"[determinism] total={num_trials}, passed={passes}, "
-              f"failed={mismatches}, batch_size={batch_size}")
+              f"failed={mismatches}, max_batch_size={max_batch_size}")

         if mismatches > 0:
             pytest.fail(
                 f"Nondeterministic outputs detected: {mismatches} failed out "
-                f"of {num_trials} trials (batch_size={batch_size}).")
+                f"of {num_trials} trials (max_batch_size={max_batch_size}).")

     finally:
         # Ensure engines are shutdown to free GPU/VRAM across test sessions
@@ -196,9 +203,14 @@ def _extract_step_logprobs(request_output):
     not torch.cuda.is_available(),
     reason="Requires CUDA to match production inference path.",
 )
-def test_logprobs_bitwise_batch_invariance_bs1_vs_bs2():
+@pytest.mark.parametrize("backend", ["FLEX_ATTENTION", "FLASHINFER"])
+def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(backend):

-    #model_name = os.getenv("VLLM_TEST_MODEL", "facebook/opt-125m")
+    backend = os.getenv("VLLM_ATTENTION_BACKEND", backend)
+    os.environ["VLLM_ATTENTION_BACKEND"] = backend
+
+    seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
+    random.seed(seed)
     model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
     tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1"))

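The two added lines give an externally exported VLLM_ATTENTION_BACKEND priority over the parametrized value, then re-export the winner so the engine under test picks it up. A standalone illustration of that precedence (not part of the diff):

# Standalone illustration of the backend-selection precedence added above.
import os


def resolve_backend(parametrized: str) -> str:
    # If the caller exported VLLM_ATTENTION_BACKEND, it wins over the pytest
    # parameter; otherwise the parametrized backend is exported for the engine.
    backend = os.getenv("VLLM_ATTENTION_BACKEND", parametrized)
    os.environ["VLLM_ATTENTION_BACKEND"] = backend
    return backend


print(resolve_backend("FLEX_ATTENTION"))  # env value if already set, else FLEX_ATTENTION

One side effect worth noting: when the env var is set, both parametrized cases run against the same backend.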
@@ -212,10 +224,15 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bs2():
     prompts = [
         "The capital of France is",
         "The capital of Germany is",
+        _random_prompt(10, 1024),
+        _random_prompt(10, 1024),
+        _random_prompt(10, 1024),
+        _random_prompt(10, 1024),
+        _random_prompt(10, 1024),
     ]

     sp = SamplingParams(
-        temperature=0.0,
+        temperature=0.6,
         top_p=1.0,
         max_tokens=8,
         # Seed shouldn't matter at temperature=0, but keeping it stable anyway.
@@ -234,25 +251,25 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bs2():
                         "enable logprobs return to run this test.")
         bs1_logprobs_per_prompt.append(step_logprobs)

-    # BS=2: run prompts in a batch and collect logprobs per step for each
+    # BS=N: run prompts in a batch and collect logprobs per step for each
     # prompt.
     outs_batched = llm.generate(prompts, sp, use_tqdm=False)
     assert len(outs_batched) == len(prompts)
-    bs2_logprobs_per_prompt = []
+    bsN_logprobs_per_prompt = []
     for o in outs_batched:
         step_logprobs = _extract_step_logprobs(o)
         if step_logprobs is None:
             pytest.skip("Logits are not available on RequestOutput; "
                         "enable logprobs return to run this test.")
-        bs2_logprobs_per_prompt.append(step_logprobs)
+        bsN_logprobs_per_prompt.append(step_logprobs)

-    # Compare step-by-step logprobs for each prompt between BS=1 and BS=2 runs.
-    for i, (logprobs_bs1, logprobs_bs2) in enumerate(
-            zip(bs1_logprobs_per_prompt, bs2_logprobs_per_prompt)):
-        assert len(logprobs_bs1) == len(logprobs_bs2), (
+    # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs.
+    for i, (logprobs_bs1, logprobs_bsN) in enumerate(
+            zip(bs1_logprobs_per_prompt, bsN_logprobs_per_prompt)):
+        assert len(logprobs_bs1) == len(logprobs_bsN), (
             f"Different number of generation steps for prompt index {i}: "
-            f"{len(logprobs_bs1)} (BS=1) vs {len(logprobs_bs2)} (BS=2)")
-        for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bs2)):
+            f"{len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)")
+        for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)):
             assert a.shape == b.shape, (
                 f"Logits shape mismatch at prompt {i}, step {t}: "
                 f"{a.shape} vs {b.shape}")