Clean up InferenceEngine

orionpapadakis · orionpapadakis · commit 5f3b6c2723a1 · 2025-07-30T16:04:17.000+03:00
diff --git a/src/main/java/com/example/inference/InferenceEngine.java b/src/main/java/com/example/inference/InferenceEngine.java
@@ -136,7 +136,6 @@ public static List<Integer> generateTokensQwen3(Model model, State state, int st
             IntConsumer onTokenGenerated) {
         // Start timing the whole process
         long startNanos = System.nanoTime();
-        long startGen = 0;
         long inferenceStartNanos = 0;
 
         // Validate and adjust maxTokens if necessary
@@ -159,15 +158,8 @@ public static List<Integer> generateTokensQwen3(Model model, State state, int st
                 // We're still processing the prompt tokens
                 final int token = promptTokens.get(promptIndex);
 
-                //System.out.println("Token: " + token);
                 model.forward(state, token, position);
 
-//                System.out.println("Token = " + token + " -> state.logits = { " +
-//                        state.logits.getFloat(0) + ", " +
-//                        state.logits.getFloat(1) + ", " +
-//                        state.logits.getFloat(2) + ", " +
-//                        state.logits.getFloat(3) + " }");
-
                 promptIndex++;
                 if (promptIndex < promptTokens.size()) {
                     continue;
@@ -176,36 +168,19 @@ public static List<Integer> generateTokensQwen3(Model model, State state, int st
                     System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
                 }
                 // We have reached the last prompt token and computed the first response-token.
-                startGen = System.nanoTime();
                 position++; // The current logit belongs to the next position
             } else {
                 // Mark the start of actual generation (after prompt processing)
                 if (inferenceStartNanos == 0) {
                     inferenceStartNanos = System.nanoTime();
                 }
 
-                //System.out.println("currentToken: " + currentToken);
                 model.forward(state, currentToken, position);
-
-//                System.out.println("currentToken = " + currentToken + " -> state.logits = { " +
-//                        state.logits.getFloat(0) + ", " +
-//                        state.logits.getFloat(1) + ", " +
-//                        state.logits.getFloat(2) + ", " +
-//                        state.logits.getFloat(3) + " }");
-
             }
 
-//            System.out.print("state.logits = { " +
-//                            state.logits.getFloat(0) + ", " +
-//                            state.logits.getFloat(1) + ", " +
-//                            state.logits.getFloat(2) + ", " +
-//                            state.logits.getFloat(3) + "}");
-
             // Sample the next token
             nextToken = sampler.sampleToken(state.logits);
 
-            //System.out.println(", nextToken: " + nextToken);
-
             // Output the token if echo is enabled
             if (echo) {
                 System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
@@ -328,12 +303,10 @@ public static List<Integer> generateTokensGPU(Model model, State state, int star
         return generatedTokens;
     }
 
-    // probably not needed TODO: check this when its working
     public static List<Integer> generateTokensGPUQwen3(Model model, State state, int startPosition, List<Integer> promptTokens, Set<Integer> stopTokens, int maxTokens, Sampler sampler, boolean echo,
             IntConsumer onTokenGenerated, TornadoVMMasterPlan tornadoVMPlan) {
         // Start timing the whole process
         long startNanos = System.nanoTime();
-        long startGen = 0;
         long inferenceStartNanos = 0;
 
         // Pre-validate the max tokens to avoid checking in the loop
@@ -369,12 +342,6 @@ public static List<Integer> generateTokensGPUQwen3(Model model, State state, int
                 //System.out.println("Token: " + token);
                 model.forward(state, token, position);
 
-//                System.out.println("Token = " + token + " -> state.wrapLogits = { " +
-//                        state.wrapLogits.get(0) + ", " +
-//                        state.wrapLogits.get(1) + ", " +
-//                        state.wrapLogits.get(2) + ", " +
-//                        state.wrapLogits.get(3) + " }");
-
                 promptIndex++;
                 if (promptIndex < promptTokens.size()) {
                     continue;
@@ -383,31 +350,19 @@ public static List<Integer> generateTokensGPUQwen3(Model model, State state, int
                     System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
                 }
                 // We have reached the last prompt token and computed the first response-token.
-                startGen = System.nanoTime();
                 position++; // The current logit belongs to the next position
             } else {
                 // Mark the start of actual generation (after prompt processing)
                 if (inferenceStartNanos == 0) {
                     inferenceStartNanos = System.nanoTime();
                 }
 
-                //System.out.println("currentToken: " + currentToken);
                 model.forward(state, currentToken, position);
-
-//                System.out.println("currentToken = " + currentToken + " -> state.wrapLogits = { " +
-//                        state.wrapLogits.get(0) + ", " +
-//                        state.wrapLogits.get(1) + ", " +
-//                        state.wrapLogits.get(2) + ", " +
-//                        state.wrapLogits.get(3) + " }");
-
             }
 
-
             // Sample the next token
             nextToken = sampler.sampleToken(state.wrapLogits);
 
-            //System.out.println(", nextToken: "+ nextToken);
-
             // Output the token if echo is enabled
             if (echo) {
                 System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));