Commit 5f3b6c2

Cleanup InferenceEngine
1 parent 9f5929a · commit 5f3b6c2

File tree

1 file changed (0 additions, 45 deletions)

src/main/java/com/example/inference/InferenceEngine.java

Lines changed: 0 additions & 45 deletions
@@ -136,7 +136,6 @@ public static List<Integer> generateTokensQwen3(Model model, State state, int st
             IntConsumer onTokenGenerated) {
         // Start timing the whole process
         long startNanos = System.nanoTime();
-        long startGen = 0;
         long inferenceStartNanos = 0;
 
         // Validate and adjust maxTokens if necessary
@@ -159,15 +158,8 @@ public static List<Integer> generateTokensQwen3(Model model, State state, int st
                 // We're still processing the prompt tokens
                 final int token = promptTokens.get(promptIndex);
 
-                //System.out.println("Token: " + token);
                 model.forward(state, token, position);
 
-                // System.out.println("Token = " + token + " -> state.logits = { " +
-                // state.logits.getFloat(0) + ", " +
-                // state.logits.getFloat(1) + ", " +
-                // state.logits.getFloat(2) + ", " +
-                // state.logits.getFloat(3) + " }");
-
                 promptIndex++;
                 if (promptIndex < promptTokens.size()) {
                     continue;
@@ -176,36 +168,19 @@ public static List<Integer> generateTokensQwen3(Model model, State state, int st
                     System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
                 }
                 // We have reached the last prompt token and computed the first response-token.
-                startGen = System.nanoTime();
                 position++; // The current logit belongs to the next position
             } else {
                 // Mark the start of actual generation (after prompt processing)
                 if (inferenceStartNanos == 0) {
                     inferenceStartNanos = System.nanoTime();
                 }
 
-                //System.out.println("currentToken: " + currentToken);
                 model.forward(state, currentToken, position);
-
-                // System.out.println("currentToken = " + currentToken + " -> state.logits = { " +
-                // state.logits.getFloat(0) + ", " +
-                // state.logits.getFloat(1) + ", " +
-                // state.logits.getFloat(2) + ", " +
-                // state.logits.getFloat(3) + " }");
-
             }
 
-            // System.out.print("state.logits = { " +
-            // state.logits.getFloat(0) + ", " +
-            // state.logits.getFloat(1) + ", " +
-            // state.logits.getFloat(2) + ", " +
-            // state.logits.getFloat(3) + "}");
-
             // Sample the next token
             nextToken = sampler.sampleToken(state.logits);
 
-            //System.out.println(", nextToken: " + nextToken);
-
             // Output the token if echo is enabled
             if (echo) {
                 System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
@@ -328,12 +303,10 @@ public static List<Integer> generateTokensGPU(Model model, State state, int star
         return generatedTokens;
     }
 
-    // probably not needed TODO: check this when its working
     public static List<Integer> generateTokensGPUQwen3(Model model, State state, int startPosition, List<Integer> promptTokens, Set<Integer> stopTokens, int maxTokens, Sampler sampler, boolean echo,
             IntConsumer onTokenGenerated, TornadoVMMasterPlan tornadoVMPlan) {
         // Start timing the whole process
         long startNanos = System.nanoTime();
-        long startGen = 0;
         long inferenceStartNanos = 0;
 
         // Pre-validate the max tokens to avoid checking in the loop
@@ -369,12 +342,6 @@ public static List<Integer> generateTokensGPUQwen3(Model model, State state, int
                 //System.out.println("Token: " + token);
                 model.forward(state, token, position);
 
-                // System.out.println("Token = " + token + " -> state.wrapLogits = { " +
-                // state.wrapLogits.get(0) + ", " +
-                // state.wrapLogits.get(1) + ", " +
-                // state.wrapLogits.get(2) + ", " +
-                // state.wrapLogits.get(3) + " }");
-
                 promptIndex++;
                 if (promptIndex < promptTokens.size()) {
                     continue;
@@ -383,31 +350,19 @@ public static List<Integer> generateTokensGPUQwen3(Model model, State state, int
                     System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
                 }
                 // We have reached the last prompt token and computed the first response-token.
-                startGen = System.nanoTime();
                 position++; // The current logit belongs to the next position
             } else {
                 // Mark the start of actual generation (after prompt processing)
                 if (inferenceStartNanos == 0) {
                     inferenceStartNanos = System.nanoTime();
                 }
 
-                //System.out.println("currentToken: " + currentToken);
                 model.forward(state, currentToken, position);
-
-                // System.out.println("currentToken = " + currentToken + " -> state.wrapLogits = { " +
-                // state.wrapLogits.get(0) + ", " +
-                // state.wrapLogits.get(1) + ", " +
-                // state.wrapLogits.get(2) + ", " +
-                // state.wrapLogits.get(3) + " }");
-
             }
 
-
             // Sample the next token
             nextToken = sampler.sampleToken(state.wrapLogits);
 
-            //System.out.println(", nextToken: "+ nextToken);
-
             // Output the token if echo is enabled
             if (echo) {
                 System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
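
After this cleanup, both generateTokensQwen3 and generateTokensGPUQwen3 keep exactly two timing markers: startNanos, taken at entry for the whole call, and inferenceStartNanos, set once the first non-prompt token is produced; the unused startGen and the commented-out logit dumps are gone. For reference, a minimal sketch of the two-phase loop shape that remains is shown below. This is an approximation, not the file's actual code: the Model, State, and Sampler types are hypothetical simplified stand-ins, and the prompt and generation phases are restructured here into two straight loops rather than the single position-indexed loop used in InferenceEngine.

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.function.IntConsumer;

class GenerationLoopSketch {

    // Hypothetical stand-ins for the real Model, State, and Sampler classes.
    interface Model { void forward(State state, int token, int position); }
    interface Sampler { int sampleToken(float[] logits); }
    static class State { float[] logits; }

    static List<Integer> generateTokens(Model model, State state, int startPosition,
                                        List<Integer> promptTokens, Set<Integer> stopTokens,
                                        int maxTokens, Sampler sampler,
                                        IntConsumer onTokenGenerated) {
        long startNanos = System.nanoTime(); // times the whole call, prompt included
        int position = startPosition;

        // Prompt phase: feed every prompt token through the model without sampling;
        // the logits left after the last prompt token yield the first response token.
        for (int token : promptTokens) {
            model.forward(state, token, position++);
        }

        // Generation phase: marked separately so prompt processing can be
        // excluded from tokens-per-second measurements.
        long inferenceStartNanos = System.nanoTime();
        List<Integer> generated = new ArrayList<>();
        int nextToken = sampler.sampleToken(state.logits);

        while (generated.size() < maxTokens && !stopTokens.contains(nextToken)) {
            generated.add(nextToken);
            if (onTokenGenerated != null) {
                onTokenGenerated.accept(nextToken);
            }
            model.forward(state, nextToken, position++);
            nextToken = sampler.sampleToken(state.logits);
        }
        return generated;
    }
}

The separate inferenceStartNanos marker matches the surviving in-code comment "Mark the start of actual generation (after prompt processing)": generation speed is presumably measured from that point, so prompt processing does not distort the rate.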
