@@ -136,7 +136,6 @@ public static List<Integer> generateTokensQwen3(Model model, State state, int st
136
136
IntConsumer onTokenGenerated ) {
137
137
// Start timing the whole process
138
138
long startNanos = System .nanoTime ();
139
- long startGen = 0 ;
140
139
long inferenceStartNanos = 0 ;
141
140
142
141
// Validate and adjust maxTokens if necessary
@@ -159,15 +158,8 @@ public static List<Integer> generateTokensQwen3(Model model, State state, int st
159
158
// We're still processing the prompt tokens
160
159
final int token = promptTokens .get (promptIndex );
161
160
162
- //System.out.println("Token: " + token);
163
161
model .forward (state , token , position );
164
162
165
- // System.out.println("Token = " + token + " -> state.logits = { " +
166
- // state.logits.getFloat(0) + ", " +
167
- // state.logits.getFloat(1) + ", " +
168
- // state.logits.getFloat(2) + ", " +
169
- // state.logits.getFloat(3) + " }");
170
-
171
163
promptIndex ++;
172
164
if (promptIndex < promptTokens .size ()) {
173
165
continue ;
@@ -176,36 +168,19 @@ public static List<Integer> generateTokensQwen3(Model model, State state, int st
176
168
System .err .print (Tokenizer .replaceControlCharacters (model .tokenizer ().decode (List .of (nextToken ))));
177
169
}
178
170
// We have reached the last prompt token and computed the first response-token.
179
- startGen = System .nanoTime ();
180
171
position ++; // The current logit belongs to the next position
181
172
} else {
182
173
// Mark the start of actual generation (after prompt processing)
183
174
if (inferenceStartNanos == 0 ) {
184
175
inferenceStartNanos = System .nanoTime ();
185
176
}
186
177
187
- //System.out.println("currentToken: " + currentToken);
188
178
model .forward (state , currentToken , position );
189
-
190
- // System.out.println("currentToken = " + currentToken + " -> state.logits = { " +
191
- // state.logits.getFloat(0) + ", " +
192
- // state.logits.getFloat(1) + ", " +
193
- // state.logits.getFloat(2) + ", " +
194
- // state.logits.getFloat(3) + " }");
195
-
196
179
}
197
180
198
- // System.out.print("state.logits = { " +
199
- // state.logits.getFloat(0) + ", " +
200
- // state.logits.getFloat(1) + ", " +
201
- // state.logits.getFloat(2) + ", " +
202
- // state.logits.getFloat(3) + "}");
203
-
204
181
// Sample the next token
205
182
nextToken = sampler .sampleToken (state .logits );
206
183
207
- //System.out.println(", nextToken: " + nextToken);
208
-
209
184
// Output the token if echo is enabled
210
185
if (echo ) {
211
186
System .err .print (Tokenizer .replaceControlCharacters (model .tokenizer ().decode (List .of (nextToken ))));
@@ -328,12 +303,10 @@ public static List<Integer> generateTokensGPU(Model model, State state, int star
328
303
return generatedTokens ;
329
304
}
330
305
331
- // probably not needed TODO: check this when its working
332
306
public static List <Integer > generateTokensGPUQwen3 (Model model , State state , int startPosition , List <Integer > promptTokens , Set <Integer > stopTokens , int maxTokens , Sampler sampler , boolean echo ,
333
307
IntConsumer onTokenGenerated , TornadoVMMasterPlan tornadoVMPlan ) {
334
308
// Start timing the whole process
335
309
long startNanos = System .nanoTime ();
336
- long startGen = 0 ;
337
310
long inferenceStartNanos = 0 ;
338
311
339
312
// Pre-validate the max tokens to avoid checking in the loop
@@ -369,12 +342,6 @@ public static List<Integer> generateTokensGPUQwen3(Model model, State state, int
369
342
//System.out.println("Token: " + token);
370
343
model .forward (state , token , position );
371
344
372
- // System.out.println("Token = " + token + " -> state.wrapLogits = { " +
373
- // state.wrapLogits.get(0) + ", " +
374
- // state.wrapLogits.get(1) + ", " +
375
- // state.wrapLogits.get(2) + ", " +
376
- // state.wrapLogits.get(3) + " }");
377
-
378
345
promptIndex ++;
379
346
if (promptIndex < promptTokens .size ()) {
380
347
continue ;
@@ -383,31 +350,19 @@ public static List<Integer> generateTokensGPUQwen3(Model model, State state, int
383
350
System .err .print (Tokenizer .replaceControlCharacters (model .tokenizer ().decode (List .of (nextToken ))));
384
351
}
385
352
// We have reached the last prompt token and computed the first response-token.
386
- startGen = System .nanoTime ();
387
353
position ++; // The current logit belongs to the next position
388
354
} else {
389
355
// Mark the start of actual generation (after prompt processing)
390
356
if (inferenceStartNanos == 0 ) {
391
357
inferenceStartNanos = System .nanoTime ();
392
358
}
393
359
394
- //System.out.println("currentToken: " + currentToken);
395
360
model .forward (state , currentToken , position );
396
-
397
- // System.out.println("currentToken = " + currentToken + " -> state.wrapLogits = { " +
398
- // state.wrapLogits.get(0) + ", " +
399
- // state.wrapLogits.get(1) + ", " +
400
- // state.wrapLogits.get(2) + ", " +
401
- // state.wrapLogits.get(3) + " }");
402
-
403
361
}
404
362
405
-
406
363
// Sample the next token
407
364
nextToken = sampler .sampleToken (state .wrapLogits );
408
365
409
- //System.out.println(", nextToken: "+ nextToken);
410
-
411
366
// Output the token if echo is enabled
412
367
if (echo ) {
413
368
System .err .print (Tokenizer .replaceControlCharacters (model .tokenizer ().decode (List .of (nextToken ))));
0 commit comments