Skip to content

Commit 62a9adb

Browse files
Initial commit for GPULlama3.java REST API support
1 parent 4473b26 commit 62a9adb

File tree

10 files changed

+678
-23
lines changed

10 files changed

+678
-23
lines changed

llama-tornado

Lines changed: 39 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -168,11 +168,14 @@ class LlamaRunner:
168168
]
169169
)
170170

171+
# Choose main class based on mode
172+
main_class = "org.beehive.gpullama3.api.LLMApiApplication" if args.service else "org.beehive.gpullama3.LlamaApp"
173+
171174
module_config.extend(
172175
[
173176
"-cp",
174177
f"{self.llama_root}/target/gpu-llama3-1.0-SNAPSHOT.jar",
175-
"org.beehive.gpullama3.LlamaApp",
178+
main_class,
176179
]
177180
)
178181
cmd.extend(module_config)
@@ -181,33 +184,28 @@ class LlamaRunner:
181184

182185
def _add_llama_args(self, cmd: List[str], args: argparse.Namespace) -> List[str]:
183186
"""Add LLaMA-specific arguments to the command."""
187+
184188
llama_args = [
185-
"-m",
186-
args.model_path,
187-
"--temperature",
188-
str(args.temperature),
189-
"--top-p",
190-
str(args.top_p),
191-
"--seed",
192-
str(args.seed),
193-
"--max-tokens",
194-
str(args.max_tokens),
195-
"--stream",
196-
str(args.stream).lower(),
197-
"--echo",
198-
str(args.echo).lower(),
189+
"--model", args.model_path,
190+
"--temperature", str(args.temperature),
191+
"--top-p", str(args.top_p),
192+
"--seed", str(args.seed),
193+
"--max-tokens", str(args.max_tokens),
194+
"--stream", str(args.stream).lower(),
195+
"--echo", str(args.echo).lower(),
196+
"--instruct" # Both modes use instruct
199197
]
200198

201-
if args.prompt:
202-
llama_args.extend(["-p", args.prompt])
199+
# Only add prompt-related args for standalone mode
200+
if not hasattr(args, 'service') or not args.service:
201+
if hasattr(args, 'prompt') and args.prompt:
202+
llama_args.extend(["-p", args.prompt])
203203

204-
if args.system_prompt:
205-
llama_args.extend(["-sp", args.system_prompt])
204+
if hasattr(args, 'system_prompt') and args.system_prompt:
205+
llama_args.extend(["-sp", args.system_prompt])
206206

207-
if args.interactive:
208-
llama_args.append("--interactive")
209-
elif args.instruct:
210-
llama_args.append("--instruct")
207+
if hasattr(args, 'interactive') and args.interactive:
208+
llama_args[-1] = "--interactive" # Replace --instruct
211209

212210
return cmd + llama_args
213211

@@ -219,6 +217,19 @@ class LlamaRunner:
219217
cmd = self._build_base_command(args)
220218
cmd = self._add_llama_args(cmd, args)
221219

220+
# Show service-specific information
221+
if args.service:
222+
print("Starting TornadoVM LLM REST API Service...")
223+
print(f"Model: {args.model_path}")
224+
print("API endpoints will be available at:")
225+
print(" - http://localhost:8080/v1/completions")
226+
print(" - http://localhost:8080/v1/completions/stream")
227+
print(" - http://localhost:8080/v1/models")
228+
print(" - http://localhost:8080/v1/health")
229+
print("\nPress Ctrl+C to stop the service")
230+
print("-" * 60)
231+
232+
222233
# Print command if requested (before verbose output)
223234
if args.show_command:
224235
print("Full Java command:")
@@ -368,6 +379,11 @@ def create_parser() -> argparse.ArgumentParser:
368379
default=True,
369380
help="Run in instruction mode (default)",
370381
)
382+
mode_group.add_argument(
383+
"--service",
384+
action="store_true",
385+
help="Run as REST API service instead of standalone application"
386+
)
371387

372388
# Hardware configuration
373389
hw_group = parser.add_argument_group("Hardware Configuration")

pom.xml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
<maven.compiler.source>21</maven.compiler.source>
1313
<maven.compiler.target>21</maven.compiler.target>
1414
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
15+
<spring.boot.version>3.2.0</spring.boot.version>
16+
<jakarta.version>3.0.0</jakarta.version>
17+
<jackson.version>2.16.1</jackson.version>
1518
</properties>
1619

1720
<dependencies>
@@ -32,6 +35,26 @@
3235
<artifactId>tornado-runtime</artifactId>
3336
<version>1.1.2-dev</version>
3437
</dependency>
38+
39+
<!-- Spring Boot Starter Web -->
40+
<dependency>
41+
<groupId>org.springframework.boot</groupId>
42+
<artifactId>spring-boot-starter-web</artifactId>
43+
<version>${spring.boot.version}</version>
44+
</dependency>
45+
46+
<dependency>
47+
<groupId>jakarta.annotation</groupId>
48+
<artifactId>jakarta.annotation-api</artifactId>
49+
<version>${jakarta.version}</version>
50+
</dependency>
51+
52+
<!-- For JSON processing -->
53+
<dependency>
54+
<groupId>com.fasterxml.jackson.core</groupId>
55+
<artifactId>jackson-databind</artifactId>
56+
<version>${jackson.version}</version>
57+
</dependency>
3558
</dependencies>
3659

3760
<build>
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
package org.beehive.gpullama3.api;
2+
3+
import org.springframework.boot.SpringApplication;
4+
import org.springframework.boot.autoconfigure.SpringBootApplication;
5+
6+
@SpringBootApplication(scanBasePackages = "org.beehive.gpullama3")
7+
public class LLMApiApplication {
8+
9+
public static void main(String[] args) {
10+
System.out.println("Starting TornadoVM LLM API Server...");
11+
System.out.println("Command line arguments: " + String.join(" ", args));
12+
13+
// Let Options.parseOptions() handle validation - no duplication
14+
SpringApplication.run(LLMApiApplication.class, args);
15+
}
16+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package org.beehive.gpullama3.api.config;
2+
3+
import org.beehive.gpullama3.model.Model;
4+
import org.beehive.gpullama3.Options;
5+
import org.beehive.gpullama3.api.service.ModelInitializationService;
6+
import org.springframework.context.annotation.Bean;
7+
import org.springframework.context.annotation.Configuration;
8+
9+
@Configuration
10+
public class ModelConfiguration {
11+
12+
/**
13+
* Expose Model as a Spring bean using the initialized service
14+
*/
15+
@Bean
16+
public Model model(ModelInitializationService initService) {
17+
return initService.getModel();
18+
}
19+
20+
/**
21+
* Expose Options as a Spring bean using the initialized service
22+
*/
23+
@Bean
24+
public Options options(ModelInitializationService initService) {
25+
return initService.getOptions();
26+
}
27+
}
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
package org.beehive.gpullama3.api.controller;

import org.beehive.gpullama3.api.model.CompletionRequest;
import org.beehive.gpullama3.api.model.CompletionResponse;
import org.beehive.gpullama3.api.service.LLMService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.servlet.mvc.method.annotation.SseEmitter;

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;

/**
 * REST controller exposing OpenAI-style completion endpoints under {@code /v1}:
 * non-streaming and streaming (SSE) completions, a model listing, and a health probe.
 *
 * <p>Response payloads are built as {@link LinkedHashMap}s rather than anonymous inner
 * classes: anonymous classes capture an implicit reference to the controller and depend on
 * public-field serialization, and {@code Map.of} is ruled out because error bodies may
 * carry a null {@code message} (e.g. {@code e.getMessage()} can be null).
 */
@RestController
@RequestMapping("/v1")
@CrossOrigin(origins = "*")
public class CompletionController {

    /** Backend service that runs inference; constructor-injected for immutability/testability. */
    private final LLMService llmService;

    @Autowired
    public CompletionController(LLMService llmService) {
        this.llmService = llmService;
    }

    /**
     * Non-streaming completion. Rejects requests flagged {@code stream=true} (those belong
     * on {@code /v1/completions/stream}) and empty prompts, then returns the generated text
     * together with rough token-usage estimates.
     *
     * @param request the completion parameters (prompt, sampling settings, stop sequences)
     * @return future completing with the generated text wrapped in an OpenAI-style response
     * @throws IllegalArgumentException if streaming was requested or the prompt is empty
     */
    @PostMapping("/completions")
    public CompletableFuture<ResponseEntity<CompletionResponse>> createCompletion(@RequestBody CompletionRequest request) {

        System.out.println("Received completion request: " + request);

        if (Boolean.TRUE.equals(request.getStream())) {
            throw new IllegalArgumentException("Use /v1/completions/stream for streaming requests");
        }
        requireNonEmptyPrompt(request);

        return llmService.generateCompletion(
                request.getPrompt(),
                request.getMaxTokens(),
                request.getTemperature(),
                request.getTopP(),
                request.getStopSequences()
        ).thenApply(generatedText -> {
            CompletionResponse response = new CompletionResponse();
            response.setModel(request.getModel());
            response.setChoices(List.of(new CompletionResponse.Choice(generatedText, 0, "stop")));

            // Rough ~4 chars/token estimate; replace with real tokenizer counts when available.
            int promptTokens = request.getPrompt().length() / 4;
            int completionTokens = generatedText.length() / 4;
            response.setUsage(new CompletionResponse.Usage(promptTokens, completionTokens));

            System.out.println("Completion response prepared, length: " + generatedText.length());

            return ResponseEntity.ok(response);
        });
    }

    /**
     * Streaming completion over Server-Sent Events. Validation happens synchronously so a
     * bad request fails fast; generation is handed to the service, which emits through the
     * returned {@link SseEmitter}.
     *
     * @param request the completion parameters
     * @return an SSE emitter the service writes generated tokens to (no timeout)
     * @throws IllegalArgumentException if the prompt is null or empty
     */
    @PostMapping(value = "/completions/stream", produces = MediaType.TEXT_EVENT_STREAM_VALUE)
    public SseEmitter createStreamingCompletion(@RequestBody CompletionRequest request) {

        System.out.println("Received streaming completion request: " + request);

        requireNonEmptyPrompt(request);

        // No timeout: generation length is model-dependent and bounded by max-tokens.
        SseEmitter emitter = new SseEmitter(Long.MAX_VALUE);

        llmService.generateStreamingCompletion(
                request.getPrompt(),
                request.getMaxTokens(),
                request.getTemperature(),
                request.getTopP(),
                request.getStopSequences(),
                emitter
        );

        return emitter;
    }

    /**
     * Lists the single model served by this process, in OpenAI {@code /v1/models} shape.
     */
    @GetMapping("/models")
    public ResponseEntity<Object> listModels() {
        Map<String, Object> model = new LinkedHashMap<>();
        model.put("id", "gpullama3");
        model.put("object", "model");
        model.put("created", System.currentTimeMillis() / 1000);
        model.put("owned_by", "beehive");

        Map<String, Object> body = new LinkedHashMap<>();
        body.put("object", "list");
        body.put("data", List.of(model));
        return ResponseEntity.ok(body);
    }

    /**
     * Liveness probe; always reports healthy while the process is up.
     */
    @GetMapping("/health")
    public ResponseEntity<Object> health() {
        Map<String, Object> body = new LinkedHashMap<>();
        body.put("status", "healthy");
        body.put("timestamp", System.currentTimeMillis());
        return ResponseEntity.ok(body);
    }

    /**
     * Maps validation failures raised by this controller to HTTP 400.
     */
    @ExceptionHandler(IllegalArgumentException.class)
    public ResponseEntity<Object> handleBadRequest(IllegalArgumentException e) {
        Map<String, Object> body = new LinkedHashMap<>();
        body.put("error", e.getMessage());
        body.put("timestamp", System.currentTimeMillis());
        return ResponseEntity.badRequest().body(body);
    }

    /**
     * Catch-all mapping to HTTP 500; logs the stack trace and returns a sanitized body.
     */
    @ExceptionHandler(Exception.class)
    public ResponseEntity<Object> handleInternalError(Exception e) {
        System.err.println("Internal server error: " + e.getMessage());
        e.printStackTrace();

        Map<String, Object> body = new LinkedHashMap<>();
        body.put("error", "Internal server error");
        body.put("message", e.getMessage()); // may be null; LinkedHashMap tolerates it
        body.put("timestamp", System.currentTimeMillis());
        return ResponseEntity.internalServerError().body(body);
    }

    /** Shared prompt validation for both completion endpoints. */
    private static void requireNonEmptyPrompt(CompletionRequest request) {
        if (request.getPrompt() == null || request.getPrompt().trim().isEmpty()) {
            throw new IllegalArgumentException("Prompt cannot be null or empty");
        }
    }
}
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
package org.beehive.gpullama3.api.model;

import com.fasterxml.jackson.annotation.JsonProperty;
import java.util.List;

/**
 * Request body for the {@code /v1/completions} endpoints, mirroring the OpenAI
 * completion-request schema. Snake_case JSON names are mapped via {@link JsonProperty};
 * all sampling parameters carry sensible defaults so a minimal request needs only a prompt.
 */
public class CompletionRequest {

    // Defaults apply when the client omits the field.
    private String model = "gpullama3";
    private String prompt;

    @JsonProperty("max_tokens")
    private Integer maxTokens = 100;

    private Double temperature = 0.7;

    @JsonProperty("top_p")
    private Double topP = 0.9;

    @JsonProperty("stop")
    private List<String> stopSequences;

    private Boolean stream = false;

    /** No-arg constructor required for JSON deserialization. */
    public CompletionRequest() {}

    public String getModel() {
        return model;
    }

    public void setModel(String model) {
        this.model = model;
    }

    public String getPrompt() {
        return prompt;
    }

    public void setPrompt(String prompt) {
        this.prompt = prompt;
    }

    public Integer getMaxTokens() {
        return maxTokens;
    }

    public void setMaxTokens(Integer maxTokens) {
        this.maxTokens = maxTokens;
    }

    public Double getTemperature() {
        return temperature;
    }

    public void setTemperature(Double temperature) {
        this.temperature = temperature;
    }

    public Double getTopP() {
        return topP;
    }

    public void setTopP(Double topP) {
        this.topP = topP;
    }

    public List<String> getStopSequences() {
        return stopSequences;
    }

    public void setStopSequences(List<String> stopSequences) {
        this.stopSequences = stopSequences;
    }

    public Boolean getStream() {
        return stream;
    }

    public void setStream(Boolean stream) {
        this.stream = stream;
    }

    /**
     * Debug representation; the prompt is truncated to its first 50 characters to keep
     * log lines short, and stop sequences are intentionally omitted.
     */
    @Override
    public String toString() {
        // A null prompt renders as prompt='null', matching plain string concatenation.
        final String promptPreview =
                (prompt == null) ? null : prompt.substring(0, Math.min(50, prompt.length())) + "...";
        StringBuilder sb = new StringBuilder("CompletionRequest{");
        sb.append("model='").append(model).append('\'');
        sb.append(", prompt='").append(promptPreview).append('\'');
        sb.append(", maxTokens=").append(maxTokens);
        sb.append(", temperature=").append(temperature);
        sb.append(", topP=").append(topP);
        sb.append(", stream=").append(stream);
        sb.append('}');
        return sb.toString();
    }
}

0 commit comments

Comments
 (0)