Skip to content

Commit 0555702

Browse files
authored
Merge pull request #518 from nerdalert/osx-convert-endpoint
Add a convert endpoint for OSX llama.cpp serving of gguf models
2 parents a17cda4 + 60cbaae commit 0555702

File tree

3 files changed

+144
-10
lines changed

3 files changed

+144
-10
lines changed

api-server/.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ go.work.sum
2525
api-server
2626
ilab-api-server
2727

28+
# db file
29+
jobs.db
30+
2831
# app specific
2932
logs/
30-
jobs.json

api-server/convert-model.go

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
package main
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
"net/http"
7+
"os"
8+
"os/exec"
9+
"path/filepath"
10+
"time"
11+
)
12+
13+
// convertModelHandler is the HTTP handler for the /model/convert endpoint.
14+
// This endpoint is valid only on OSX machines (srv.isOSX == true).
15+
func (srv *ILabServer) convertModelHandler(w http.ResponseWriter, r *http.Request) {
16+
srv.log.Info("POST /model/convert called")
17+
18+
// If not OS X, return an error.
19+
if !srv.isOSX {
20+
srv.log.Warn("Attempt to use /model/convert on a non-OSX machine")
21+
http.Error(w, "Model conversion endpoint is available only on OSX", http.StatusForbidden)
22+
return
23+
}
24+
25+
var reqBody struct {
26+
ModelDir string `json:"model_dir"`
27+
}
28+
if err := json.NewDecoder(r.Body).Decode(&reqBody); err != nil {
29+
srv.log.Errorf("Error parsing convert request body: %v", err)
30+
http.Error(w, "Invalid request body", http.StatusBadRequest)
31+
return
32+
}
33+
34+
// Ensure we have a model directory path
35+
if reqBody.ModelDir == "" {
36+
srv.log.Info("Missing required parameter: model_dir")
37+
http.Error(w, "Missing required parameter: model_dir", http.StatusBadRequest)
38+
return
39+
}
40+
41+
jobID, err := srv.startConvertJob(reqBody.ModelDir)
42+
if err != nil {
43+
srv.log.Errorf("Error starting convert job: %v", err)
44+
http.Error(w, fmt.Sprintf("Failed to start convert job: %v", err), http.StatusInternalServerError)
45+
return
46+
}
47+
48+
w.Header().Set("Content-Type", "application/json")
49+
_ = json.NewEncoder(w).Encode(map[string]string{"job_id": jobID})
50+
srv.log.Infof("POST /model/convert started successfully, job_id: %s", jobID)
51+
}
52+
53+
// startConvertJob launches "ilab model convert --model-dir=..."
54+
func (srv *ILabServer) startConvertJob(modelDir string) (string, error) {
55+
ilabPath := srv.getIlabCommand()
56+
57+
cmdArgs := []string{
58+
"model", "convert",
59+
fmt.Sprintf("--model-dir=%s", modelDir),
60+
}
61+
62+
// Unique job ID & log file
63+
jobID := fmt.Sprintf("c-%d", time.Now().UnixNano())
64+
logFilePath := filepath.Join("logs", fmt.Sprintf("%s.log", jobID))
65+
66+
finalCmdString := fmt.Sprintf("[ILAB CONVERT COMMAND] %s %v", ilabPath, cmdArgs)
67+
srv.log.Info(finalCmdString)
68+
69+
cmd := exec.Command(ilabPath, cmdArgs...)
70+
if !srv.rhelai {
71+
cmd.Dir = srv.baseDir
72+
}
73+
74+
// Log the job
75+
logFile, err := os.Create(logFilePath)
76+
if err != nil {
77+
return "", fmt.Errorf("failed to create log file for convert job: %v", err)
78+
}
79+
fmt.Fprintln(logFile, finalCmdString)
80+
81+
cmd.Stdout = logFile
82+
cmd.Stderr = logFile
83+
84+
srv.log.Infof("Starting ilab convert process with job ID '%s'", jobID)
85+
if err := cmd.Start(); err != nil {
86+
logFile.Close()
87+
srv.log.Errorf("Error starting convert command: %v", err)
88+
return "", err
89+
}
90+
91+
// Create DB record for the job
92+
newJob := &Job{
93+
JobID: jobID,
94+
Cmd: ilabPath,
95+
Args: cmdArgs,
96+
Status: "running",
97+
PID: cmd.Process.Pid,
98+
LogFile: logFilePath,
99+
StartTime: time.Now(),
100+
}
101+
if err := srv.createJob(newJob); err != nil {
102+
srv.log.Errorf("Error creating convert job in DB: %v", err)
103+
}
104+
105+
go func() {
106+
defer logFile.Close()
107+
err := cmd.Wait()
108+
109+
newJob.Lock.Lock()
110+
defer newJob.Lock.Unlock()
111+
112+
if err != nil {
113+
newJob.Status = "failed"
114+
srv.log.Infof("Convert job %s failed: %v", newJob.JobID, err)
115+
} else if cmd.ProcessState.Success() {
116+
newJob.Status = "finished"
117+
srv.log.Infof("Convert job %s finished successfully", newJob.JobID)
118+
} else {
119+
newJob.Status = "failed"
120+
srv.log.Infof("Convert job %s failed (unknown reason)", newJob.JobID)
121+
}
122+
now := time.Now()
123+
newJob.EndTime = &now
124+
_ = srv.updateJob(newJob)
125+
}()
126+
127+
return jobID, nil
128+
}

api-server/main.go

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,7 @@ func (srv *ILabServer) runServer(cmd *cobra.Command, args []string) {
270270
r.HandleFunc("/vllm-status", srv.getVllmStatusHandler).Methods("GET")
271271
r.HandleFunc("/gpu-free", srv.getGpuFreeHandler).Methods("GET")
272272
r.HandleFunc("/served-model-jobids", srv.listServedModelJobIDsHandler).Methods("GET")
273+
r.HandleFunc("/model/convert", srv.convertModelHandler).Methods("POST")
273274

274275
srv.log.Info("Server starting on port 8080... (Taxonomy path: ", srv.taxonomyPath, ")")
275276
if err := http.ListenAndServe("0.0.0.0:8080", r); err != nil {
@@ -510,19 +511,17 @@ func (srv *ILabServer) startTrainJob(modelName, branchName string, epochs *int)
510511
}
511512

512513
if srv.pipelineType == "simple" && !srv.rhelai {
513-
// TODO: Works on RHEL not from ilab main. --model-path seems to only accept the repo/name here and not the full path. Commenting for now.
514-
//homeDir, err := os.UserHomeDir()
515-
//if err != nil {
516-
// return "", fmt.Errorf("failed to get user home directory: %v", err)
517-
//}
518-
//datasetDir := filepath.Join(homeDir, ".local", "share", "instructlab", "datasets")
514+
homeDir, err := os.UserHomeDir()
515+
if err != nil {
516+
return "", fmt.Errorf("failed to get user home directory: %v", err)
517+
}
518+
modelDir := filepath.Join(homeDir, ".cache", "instructlab", "models")
519519

520520
cmdArgs = []string{
521521
"model", "train",
522522
"--pipeline", srv.pipelineType,
523523
"--optimize-memory",
524-
//fmt.Sprintf("--data-path=%s", datasetDir), // Leaving commented out for now until the above todo is resolved.
525-
fmt.Sprintf("--model-path=%s", modelName),
524+
fmt.Sprintf("--gguf-model-path=%s/%s", modelDir, modelName),
526525
}
527526
if srv.isOSX {
528527
cmdArgs = append(cmdArgs, "--device=mps")
@@ -561,7 +560,8 @@ func (srv *ILabServer) startTrainJob(modelName, branchName string, epochs *int)
561560
}
562561
}
563562

564-
srv.log.Infof("[ILAB TRAIN COMMAND] %s %v", ilabPath, cmdArgs)
563+
finalCmdString := fmt.Sprintf("[ILAB TRAIN COMMAND] %s %v", ilabPath, cmdArgs)
564+
srv.log.Info(finalCmdString)
565565

566566
cmd := exec.Command(ilabPath, cmdArgs...)
567567
if !srv.rhelai {
@@ -577,12 +577,15 @@ func (srv *ILabServer) startTrainJob(modelName, branchName string, epochs *int)
577577
cmd.Stdout = logFile
578578
cmd.Stderr = logFile
579579

580+
fmt.Fprintln(logFile, finalCmdString)
581+
580582
srv.log.Infof("[ILAB TRAIN COMMAND] %s %v", ilabPath, cmdArgs)
581583
if err := cmd.Start(); err != nil {
582584
return "", fmt.Errorf("error starting training command: %v", err)
583585
}
584586
srv.log.Infof("Training process started with PID: %d", cmd.Process.Pid)
585587

588+
// Create a DB record for this job
586589
newJob := &Job{
587590
JobID: jobID,
588591
Cmd: ilabPath,
@@ -597,6 +600,7 @@ func (srv *ILabServer) startTrainJob(modelName, branchName string, epochs *int)
597600
return "", fmt.Errorf("failed to create job in DB: %v", err)
598601
}
599602

603+
// Wait in a goroutine for the job to complete
600604
go func() {
601605
defer logFile.Close()
602606
err := cmd.Wait()

0 commit comments

Comments
 (0)