-
Notifications
You must be signed in to change notification settings - Fork 277
feat(config): watch config file and hot-reload router without restart #84
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,62 @@ | ||
| package config_test | ||
|
|
||
| import ( | ||
| "os" | ||
| "path/filepath" | ||
| "runtime" | ||
|
|
||
| . "github.com/onsi/ginkgo/v2" | ||
| . "github.com/onsi/gomega" | ||
|
|
||
| "github.com/vllm-project/semantic-router/semantic-router/pkg/config" | ||
| ) | ||
|
|
||
| var _ = Describe("ParseConfigFile and ReplaceGlobalConfig", func() { | ||
| var tempDir string | ||
|
|
||
| BeforeEach(func() { | ||
| var err error | ||
| tempDir, err = os.MkdirTemp("", "config_parse_test") | ||
| Expect(err).NotTo(HaveOccurred()) | ||
| }) | ||
|
|
||
| AfterEach(func() { | ||
| os.RemoveAll(tempDir) | ||
| config.ResetConfig() | ||
| }) | ||
|
|
||
| It("should parse configuration via symlink path", func() { | ||
| if runtime.GOOS == "windows" { | ||
| Skip("symlink test is skipped on Windows") | ||
| } | ||
|
|
||
| // Create real config target | ||
| target := filepath.Join(tempDir, "real-config.yaml") | ||
| content := []byte("default_model: test-model\n") | ||
| Expect(os.WriteFile(target, content, 0o644)).To(Succeed()) | ||
|
|
||
| // Create symlink pointing to target | ||
| link := filepath.Join(tempDir, "link-config.yaml") | ||
| Expect(os.Symlink(target, link)).To(Succeed()) | ||
|
|
||
| cfg, err := config.ParseConfigFile(link) | ||
| Expect(err).NotTo(HaveOccurred()) | ||
| Expect(cfg).NotTo(BeNil()) | ||
| Expect(cfg.DefaultModel).To(Equal("test-model")) | ||
| }) | ||
|
|
||
| It("should return error when file does not exist", func() { | ||
| _, err := config.ParseConfigFile(filepath.Join(tempDir, "no-such.yaml")) | ||
| Expect(err).To(HaveOccurred()) | ||
| Expect(err.Error()).To(ContainSubstring("failed to read config file")) | ||
| }) | ||
|
|
||
| It("should replace global config and reflect via GetConfig", func() { | ||
| // new config instance | ||
| newCfg := &config.RouterConfig{DefaultModel: "new-default"} | ||
| config.ReplaceGlobalConfig(newCfg) | ||
| got := config.GetConfig() | ||
| Expect(got).To(Equal(newCfg)) | ||
| Expect(got.DefaultModel).To(Equal("new-default")) | ||
| }) | ||
| }) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,22 +1,28 @@ | ||
| package extproc | ||
|
|
||
| import ( | ||
| "context" | ||
| "fmt" | ||
| "log" | ||
| "net" | ||
| "os" | ||
| "os/signal" | ||
| "path/filepath" | ||
| "sync/atomic" | ||
| "syscall" | ||
| "time" | ||
|
|
||
| ext_proc "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" | ||
| "github.com/fsnotify/fsnotify" | ||
| "github.com/vllm-project/semantic-router/semantic-router/pkg/observability" | ||
| "google.golang.org/grpc" | ||
| ) | ||
|
|
||
| // Server represents a gRPC server for the Envoy ExtProc | ||
| type Server struct { | ||
| router *OpenAIRouter | ||
| server *grpc.Server | ||
| port int | ||
| configPath string | ||
| service *RouterService | ||
| server *grpc.Server | ||
| port int | ||
| } | ||
|
|
||
| // NewServer creates a new ExtProc gRPC server | ||
|
|
@@ -26,9 +32,11 @@ func NewServer(configPath string, port int) (*Server, error) { | |
| return nil, err | ||
| } | ||
|
|
||
| service := NewRouterService(router) | ||
| return &Server{ | ||
| router: router, | ||
| port: port, | ||
| configPath: configPath, | ||
| service: service, | ||
| port: port, | ||
| }, nil | ||
| } | ||
|
|
||
|
|
@@ -40,21 +48,26 @@ func (s *Server) Start() error { | |
| } | ||
|
|
||
| s.server = grpc.NewServer() | ||
| ext_proc.RegisterExternalProcessorServer(s.server, s.router) | ||
| ext_proc.RegisterExternalProcessorServer(s.server, s.service) | ||
|
|
||
| log.Printf("Starting LLM Router ExtProc server on port %d...", s.port) | ||
| observability.Infof("Starting LLM Router ExtProc server on port %d...", s.port) | ||
|
|
||
| // Run the server in a separate goroutine | ||
| serverErrCh := make(chan error, 1) | ||
| go func() { | ||
| if err := s.server.Serve(lis); err != nil && err != grpc.ErrServerStopped { | ||
| log.Printf("Server error: %v", err) | ||
| observability.Errorf("Server error: %v", err) | ||
| serverErrCh <- err | ||
| } else { | ||
| serverErrCh <- nil | ||
| } | ||
| }() | ||
|
|
||
| // Start config file watcher in background | ||
| ctx, cancel := context.WithCancel(context.Background()) | ||
| defer cancel() | ||
| go s.watchConfigAndReload(ctx) | ||
|
|
||
| // Wait for interrupt signal to gracefully shut down the server | ||
| signalChan := make(chan os.Signal, 1) | ||
| signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM) | ||
|
|
@@ -63,11 +76,11 @@ func (s *Server) Start() error { | |
| select { | ||
| case err := <-serverErrCh: | ||
| if err != nil { | ||
| log.Printf("Server exited with error: %v", err) | ||
| observability.Errorf("Server exited with error: %v", err) | ||
| return err | ||
| } | ||
| case <-signalChan: | ||
| log.Println("Received shutdown signal, gracefully stopping server...") | ||
| observability.Infof("Received shutdown signal, gracefully stopping server...") | ||
| } | ||
|
|
||
| s.Stop() | ||
|
|
@@ -78,6 +91,105 @@ func (s *Server) Start() error { | |
| func (s *Server) Stop() { | ||
| if s.server != nil { | ||
| s.server.GracefulStop() | ||
| log.Println("Server stopped") | ||
| observability.Infof("Server stopped") | ||
| } | ||
| } | ||
|
|
||
| // RouterService is a delegating gRPC service that forwards to the current router implementation. | ||
| type RouterService struct { | ||
| current atomic.Pointer[OpenAIRouter] | ||
| } | ||
|
|
||
| func NewRouterService(r *OpenAIRouter) *RouterService { | ||
| rs := &RouterService{} | ||
| rs.current.Store(r) | ||
| return rs | ||
| } | ||
|
|
||
| // Swap replaces the current router implementation. | ||
| func (rs *RouterService) Swap(r *OpenAIRouter) { rs.current.Store(r) } | ||
|
|
||
| // Process delegates to the current router. | ||
| func (rs *RouterService) Process(stream ext_proc.ExternalProcessor_ProcessServer) error { | ||
| r := rs.current.Load() | ||
| return r.Process(stream) | ||
| } | ||
|
|
||
| // watchConfigAndReload watches the config file and reloads router on changes. | ||
| func (s *Server) watchConfigAndReload(ctx context.Context) { | ||
| watcher, err := fsnotify.NewWatcher() | ||
| if err != nil { | ||
| observability.LogEvent("config_watcher_error", map[string]interface{}{ | ||
| "stage": "create_watcher", | ||
| "error": err.Error(), | ||
| }) | ||
| return | ||
| } | ||
| defer watcher.Close() | ||
|
|
||
| cfgFile := s.configPath | ||
| cfgDir := filepath.Dir(cfgFile) | ||
|
|
||
| // Watch both the file and its directory to handle symlink swaps (Kubernetes ConfigMap) | ||
| if err := watcher.Add(cfgDir); err != nil { | ||
| observability.LogEvent("config_watcher_error", map[string]interface{}{ | ||
| "stage": "watch_dir", | ||
| "dir": cfgDir, | ||
| "error": err.Error(), | ||
| }) | ||
| return | ||
| } | ||
| _ = watcher.Add(cfgFile) // best-effort; may fail if file replaced by symlink later | ||
|
|
||
| // Debounce events | ||
| var ( | ||
| pending bool | ||
| last time.Time | ||
| ) | ||
|
|
||
| reload := func() { | ||
| // Parse and build a new router | ||
| newRouter, err := NewOpenAIRouter(cfgFile) | ||
| if err != nil { | ||
| observability.LogEvent("config_reload_failed", map[string]interface{}{ | ||
| "file": cfgFile, | ||
| "error": err.Error(), | ||
| }) | ||
| return | ||
| } | ||
| s.service.Swap(newRouter) | ||
| observability.LogEvent("config_reloaded", map[string]interface{}{ | ||
| "file": cfgFile, | ||
| }) | ||
| } | ||
|
|
||
| for { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a followup pr to test config file update under different scenarios? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. #92 for tracking |
||
| select { | ||
| case <-ctx.Done(): | ||
| return | ||
| case ev, ok := <-watcher.Events: | ||
| if !ok { | ||
| return | ||
| } | ||
| if ev.Op&(fsnotify.Write|fsnotify.Create|fsnotify.Rename|fsnotify.Remove|fsnotify.Chmod) != 0 { | ||
| // If the event pertains to the config file or directory, trigger debounce | ||
| if filepath.Base(ev.Name) == filepath.Base(cfgFile) || filepath.Dir(ev.Name) == cfgDir { | ||
| if !pending || time.Since(last) > 250*time.Millisecond { | ||
| pending = true | ||
| last = time.Now() | ||
| // Slight delay to let file settle | ||
| go func() { time.Sleep(300 * time.Millisecond); reload() }() | ||
| } | ||
| } | ||
| } | ||
| case err, ok := <-watcher.Errors: | ||
| if !ok { | ||
| return | ||
| } | ||
| observability.LogEvent("config_watcher_error", map[string]interface{}{ | ||
| "stage": "watch_loop", | ||
| "error": err.Error(), | ||
| }) | ||
| } | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.