@@ -2,6 +2,7 @@ package main
22
33import (
44 "bytes"
5+ "database/sql"
56 "encoding/json"
67 "fmt"
78 "github.com/gorilla/mux"
@@ -243,12 +244,21 @@ func (srv *ILabServer) getVllmStatusHandler(w http.ResponseWriter, r *http.Reque
243244 return
244245 }
245246
246- srv .jobIDsMutex .RLock ()
247- jobID , ok := srv .servedModelJobIDs [modelName ]
248- srv .jobIDsMutex .RUnlock ()
247+ // Directly query the DB for the job associated with this model
248+ var jobID string
249+ err = srv .db .QueryRow (`
250+ SELECT job_id
251+ FROM jobs
252+ WHERE served_model_name = ? AND status = 'running'
253+ LIMIT 1
254+ ` , modelName ).Scan (& jobID )
249255
250- if ! ok {
251- srv .log .Infof ("WTF jobid not found for model '%s'" , modelName )
256+ if err == sql .ErrNoRows {
257+ srv .log .Infof ("No running job found for model '%s'" , modelName )
258+ _ = json .NewEncoder (w ).Encode (map [string ]string {"status" : "loading" })
259+ return
260+ } else if err != nil {
261+ srv .log .Errorf ("Error querying job for model '%s': %v" , modelName , err )
252262 _ = json .NewEncoder (w ).Encode (map [string ]string {"status" : "loading" })
253263 return
254264 }
@@ -629,6 +639,26 @@ func (srv *ILabServer) runVllmContainerHandler(
629639 gpuIndex int , hostVolume , containerVolume string ,
630640 w http.ResponseWriter ,
631641) {
642+ // Check if a job is already running for the requested model
643+ existingJob , err := srv .getRunningJobByModel (servedModelName )
644+ if err != nil {
645+ srv .log .Errorf ("Error checking existing jobs for model '%s': %v" , servedModelName , err )
646+ http .Error (w , "Internal server error" , http .StatusInternalServerError )
647+ return
648+ }
649+ if existingJob != nil {
650+ srv .log .Infof ("A job is already running for model '%s' with job_id: %s" , servedModelName , existingJob .JobID )
651+ w .Header ().Set ("Content-Type" , "application/json" )
652+ _ = json .NewEncoder (w ).Encode (map [string ]string {
653+ "status" : "already_running" ,
654+ "job_id" : existingJob .JobID ,
655+ "message" : fmt .Sprintf ("Model '%s' is already being served." , servedModelName ),
656+ })
657+ return
658+ }
659+
660+ srv .log .Infof ("No existing job found for model '%s'. Starting a new job." , servedModelName )
661+
632662 cmdArgs := []string {
633663 "run" , "--rm" ,
634664 fmt .Sprintf ("--device=nvidia.com/gpu=%d" , gpuIndex ),
@@ -681,13 +711,14 @@ func (srv *ILabServer) runVllmContainerHandler(
681711
682712 // Create a Job record and store it in the DB
683713 newJob := & Job {
684- JobID : jobID ,
685- Cmd : "podman" ,
686- Args : cmdArgs ,
687- Status : "running" ,
688- PID : cmd .Process .Pid ,
689- LogFile : logFilePath ,
690- StartTime : time .Now (),
714+ JobID : jobID ,
715+ Cmd : "podman" ,
716+ Args : cmdArgs ,
717+ Status : "running" ,
718+ PID : cmd .Process .Pid ,
719+ LogFile : logFilePath ,
720+ StartTime : time .Now (),
721+ ServedModelName : servedModelName ,
691722 }
692723 if err := srv .createJob (newJob ); err != nil {
693724 srv .log .Errorf ("Failed to create job in DB for %s: %v" , jobID , err )
@@ -859,6 +890,59 @@ func (srv *ILabServer) serveModelHandler(modelPath, port string, w http.Response
859890 _ = json .NewEncoder (w ).Encode (map [string ]string {"status" : "model process started" , "job_id" : jobID })
860891}
861892
893+ // getRunningJobByModel retrieves a running job for the specified served_model_name.
894+ // Returns nil if no such job exists.
895+ func (srv * ILabServer ) getRunningJobByModel (servedModelName string ) (* Job , error ) {
896+ var job Job
897+ var argsJSON string
898+ var startTimeStr , endTimeStr sql.NullString
899+
900+ row := srv .db .QueryRow (`
901+ SELECT job_id, cmd, args, status, pid, log_file, start_time, end_time, branch, served_model_name
902+ FROM jobs
903+ WHERE served_model_name = ? AND status = 'running'
904+ LIMIT 1
905+ ` , servedModelName )
906+
907+ err := row .Scan (
908+ & job .JobID ,
909+ & job .Cmd ,
910+ & argsJSON ,
911+ & job .Status ,
912+ & job .PID ,
913+ & job .LogFile ,
914+ & startTimeStr ,
915+ & endTimeStr ,
916+ & job .Branch ,
917+ & job .ServedModelName ,
918+ )
919+ if err == sql .ErrNoRows {
920+ return nil , nil
921+ } else if err != nil {
922+ return nil , err
923+ }
924+
925+ if err := json .Unmarshal ([]byte (argsJSON ), & job .Args ); err != nil {
926+ srv .log .Errorf ("Failed to unmarshal Args for job '%s': %v" , job .JobID , err )
927+ return nil , fmt .Errorf ("failed to unmarshal Args for job '%s': %v" , job .JobID , err )
928+ }
929+
930+ if startTimeStr .Valid {
931+ t , err := time .Parse (time .RFC3339 , startTimeStr .String )
932+ if err == nil {
933+ job .StartTime = t
934+ }
935+ }
936+ if endTimeStr .Valid && endTimeStr .String != "" {
937+ t , err := time .Parse (time .RFC3339 , endTimeStr .String )
938+ if err == nil {
939+ job .EndTime = & t
940+ }
941+ }
942+
943+ return & job , nil
944+ }
945+
862946// listServedModelJobIDsHandler is a debug endpoint to list current model to jobID mappings.
863947func (srv * ILabServer ) listServedModelJobIDsHandler (w http.ResponseWriter , r * http.Request ) {
864948 srv .jobIDsMutex .RLock ()
0 commit comments