@@ -29,7 +29,6 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/valyala/fasthttp"
 	"golang.org/x/sync/errgroup"
-	"gopkg.in/yaml.v3"
 	"k8s.io/klog/v2"
 
 	"github.com/llm-d/llm-d-inference-sim/pkg/common"
@@ -260,69 +259,6 @@ func (s *VllmSimulator) initDataset() error {
 	return nil
 }
 
-func (s *VllmSimulator) newListener() (net.Listener, error) {
-	s.logger.Info("Server starting", "port", s.config.Port)
-	listener, err := net.Listen("tcp4", fmt.Sprintf(":%d", s.config.Port))
-	if err != nil {
-		return nil, err
-	}
-	return listener, nil
-}
-
-// startServer starts the HTTP server on the port defined in the command line
-func (s *VllmSimulator) startServer(ctx context.Context, listener net.Listener) error {
-	r := fasthttprouter.New()
-
-	// support the completion APIs
-	r.POST("/v1/chat/completions", s.HandleChatCompletions)
-	r.POST("/v1/completions", s.HandleTextCompletions)
-	// support the /models API
-	r.GET("/v1/models", s.HandleModels)
-	// support load/unload of a LoRA adapter
-	r.POST("/v1/load_lora_adapter", s.HandleLoadLora)
-	r.POST("/v1/unload_lora_adapter", s.HandleUnloadLora)
-	// support the /metrics Prometheus API
-	r.GET("/metrics", fasthttpadaptor.NewFastHTTPHandler(promhttp.HandlerFor(s.registry, promhttp.HandlerOpts{})))
-	// support the standard Kubernetes health and readiness checks
-	r.GET("/health", s.HandleHealth)
-	r.GET("/ready", s.HandleReady)
-	r.POST("/tokenize", s.HandleTokenize)
-
-	server := fasthttp.Server{
-		ErrorHandler: s.HandleError,
-		Handler:      r.Handler,
-		Logger:       s,
-	}
-
-	// Start the server in a goroutine
-	serverErr := make(chan error, 1)
-	go func() {
-		s.logger.Info("HTTP server starting")
-		serverErr <- server.Serve(listener)
-	}()
-
-	// Wait for either context cancellation or a server error
-	select {
-	case <-ctx.Done():
-		s.logger.Info("Shutdown signal received, shutting down HTTP server gracefully")
-
-		// Gracefully shut down the server
-		if err := server.Shutdown(); err != nil {
-			s.logger.Error(err, "Error during server shutdown")
-			return err
-		}
-
-		s.logger.Info("HTTP server stopped")
-		return nil
-
-	case err := <-serverErr:
-		if err != nil {
-			s.logger.Error(err, "HTTP server failed")
-		}
-		return err
-	}
-}
-
 // Printf prints to a log; implementation of fasthttp.Logger
 func (s *VllmSimulator) Printf(format string, args ...interface{}) {
 	s.logger.Info("Server error", "msg", fmt.Sprintf(format, args...))
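
For context on what this removal drops: the `newListener`/`startServer` pair implemented a listen-then-serve flow with graceful shutdown on context cancellation. A minimal, self-contained sketch of that pattern (hypothetical `run` helper and hard-coded port, not code from this repo):

```go
package main

import (
	"context"
	"log"
	"net"
	"os/signal"
	"syscall"

	"github.com/valyala/fasthttp"
)

// run serves on the listener and shuts down gracefully when ctx is
// cancelled, mirroring the select loop in the removed startServer.
func run(ctx context.Context, ln net.Listener) error {
	server := fasthttp.Server{
		Handler: func(c *fasthttp.RequestCtx) { c.SetBodyString("{}") },
	}

	serverErr := make(chan error, 1)
	go func() {
		// Serve blocks until the listener is closed or an error occurs.
		serverErr <- server.Serve(ln)
	}()

	select {
	case <-ctx.Done():
		// Stop accepting new connections and drain in-flight requests.
		return server.Shutdown()
	case err := <-serverErr:
		return err
	}
}

func main() {
	// Cancel the context on SIGINT/SIGTERM, as a caller of startServer might.
	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()

	ln, err := net.Listen("tcp4", ":8000") // newListener used the configured port
	if err != nil {
		log.Fatal(err)
	}
	if err := run(ctx, ln); err != nil {
		log.Fatal(err)
	}
}
```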
@@ -594,66 +530,6 @@ func (s *VllmSimulator) createModelsResponse() *vllmapi.ModelsResponse {
 	return &modelsResp
 }
 
-// HandleHealth is the HTTP handler for /health
-func (s *VllmSimulator) HandleHealth(ctx *fasthttp.RequestCtx) {
-	s.logger.V(4).Info("health request received")
-	ctx.Response.Header.SetContentType("application/json")
-	ctx.Response.Header.SetStatusCode(fasthttp.StatusOK)
-	ctx.Response.SetBody([]byte("{}"))
-}
-
-// HandleReady is the HTTP handler for /ready
-func (s *VllmSimulator) HandleReady(ctx *fasthttp.RequestCtx) {
-	s.logger.V(4).Info("readiness request received")
-	ctx.Response.Header.SetContentType("application/json")
-	ctx.Response.Header.SetStatusCode(fasthttp.StatusOK)
-	ctx.Response.SetBody([]byte("{}"))
-}
-
-// getDisplayedModelName returns the model name that must appear in API
-// responses. LoRA adapters keep their explicit name, while all base-model
-// requests are surfaced as the first alias from --served-model-name.
-func (s *VllmSimulator) getDisplayedModelName(reqModel string) string {
-	if s.isLora(reqModel) {
-		return reqModel
-	}
-	return s.config.ServedModelNames[0]
-}
-
-func (s *VllmSimulator) showConfig(dp bool) error {
-	cfgYAML, err := yaml.Marshal(s.config)
-	if err != nil {
-		return fmt.Errorf("failed to marshal configuration to YAML: %w", err)
-	}
-
-	var m map[string]interface{}
-	err = yaml.Unmarshal(cfgYAML, &m)
-	if err != nil {
-		return fmt.Errorf("failed to unmarshal YAML to map: %w", err)
-	}
-	if dp {
-		// remove the port
-		delete(m, "port")
-	}
-	// clean the LoraModulesString field
-	m["lora-modules"] = m["LoraModules"]
-	delete(m, "LoraModules")
-	delete(m, "LoraModulesString")
-
-	// clean the fake-metrics field
-	if field, ok := m["fake-metrics"].(map[string]interface{}); ok {
-		delete(field, "LorasString")
-	}
-
-	// show in YAML
-	cfgYAML, err = yaml.Marshal(m)
-	if err != nil {
-		return fmt.Errorf("failed to marshal configuration to YAML: %w", err)
-	}
-	s.logger.Info("Configuration:", "", string(cfgYAML))
-	return nil
-}
-
 func (s *VllmSimulator) getCurrFactor() float64 {
 	if s.config.MaxNumSeqs <= 1 {
 		return 1.0
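
The removed `showConfig` relied on a YAML round-trip: marshal the config struct, unmarshal it into a generic map, then delete or rename keys before re-marshalling for display. A minimal sketch of that trick, with an illustrative `Config` type standing in for the simulator's real configuration:

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// Config is an illustrative stand-in, not the real simulator config.
type Config struct {
	Port        int      `yaml:"port"`
	LoraModules []string `yaml:"LoraModules"`
}

func main() {
	cfg := Config{Port: 8000, LoraModules: []string{"lora-a"}}

	// Round-trip through a map so individual keys can be dropped or
	// renamed before display.
	raw, err := yaml.Marshal(cfg)
	if err != nil {
		panic(err)
	}
	var m map[string]interface{}
	if err := yaml.Unmarshal(raw, &m); err != nil {
		panic(err)
	}
	delete(m, "port")                    // hide the port when requested, as showConfig did
	m["lora-modules"] = m["LoraModules"] // surface the key under its CLI flag name
	delete(m, "LoraModules")

	out, err := yaml.Marshal(m)
	if err != nil {
		panic(err)
	}
	fmt.Print(string(out))
}
```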
@@ -676,31 +552,3 @@ func (s *VllmSimulator) GetPrefillTimePerToken() int {
 func (s *VllmSimulator) GetInterTokenLatency() int {
 	return int(float64(s.config.InterTokenLatency) * s.getCurrFactor())
 }
-<<<<<<< HEAD
-
-// generateTokens creates and returns the response payload for this request:
-// an array of generated tokens, the finish reason, and the number of
-// generated tokens
-func (s *VllmSimulator) generateTokens(req openaiserverapi.CompletionRequest) ([]string, string, int, error) {
-	ignoreEOS := req.GetIgnoreEOS()
-	var maxTokens *int64
-	var prompt string
-
-	if chatReq, ok := req.(*openaiserverapi.ChatCompletionRequest); ok {
-		maxTokens = chatReq.GetMaxCompletionTokens()
-		prompt = chatReq.GetLastUserMsg()
-	} else if textReq, ok := req.(*openaiserverapi.TextCompletionRequest); ok {
-		maxTokens = textReq.MaxTokens
-		prompt = textReq.GetPrompt()
-	} else {
-		return nil, "", 0, fmt.Errorf("unknown request type: %T", req)
-	}
-
-	var finishReason string
-	var tokens []string
-	if s.config.Mode == common.ModeEcho {
-		tokens, finishReason = common.EchoResponseTokens(maxTokens, prompt)
-		return tokens, finishReason, len(tokens), nil
-	}
-	tokens, finishReason = common.GetRandomTokens(maxTokens, ignoreEOS, s.dataset)
-	return tokens, finishReason, len(tokens), nil
-}
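
The removed `generateTokens` dispatched on the concrete request type to extract a prompt and token limit before producing tokens in echo or random mode. A small sketch of that dispatch shape, using hypothetical stand-in types (the real ones live in `openaiserverapi`):

```go
package main

import "fmt"

// Stand-ins for the chat and text completion request types.
type chatRequest struct {
	maxCompletionTokens *int64
	lastUserMsg         string
}

type textRequest struct {
	maxTokens *int64
	prompt    string
}

// promptAndLimit mirrors the type dispatch at the top of generateTokens:
// each concrete request type exposes its prompt and limit differently.
func promptAndLimit(req interface{}) (string, *int64, error) {
	switch r := req.(type) {
	case *chatRequest:
		return r.lastUserMsg, r.maxCompletionTokens, nil
	case *textRequest:
		return r.prompt, r.maxTokens, nil
	default:
		return "", nil, fmt.Errorf("unknown request type: %T", req)
	}
}

func main() {
	limit := int64(16)
	prompt, max, err := promptAndLimit(&chatRequest{&limit, "hello"})
	fmt.Println(prompt, *max, err)
}
```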