From ce80821a28223490c25a20e4703b4cede7cd0c4a Mon Sep 17 00:00:00 2001 From: Gregory Newman-Smith Date: Mon, 10 Nov 2025 09:20:59 +0000 Subject: [PATCH 01/13] CBG-4714: collect goroutine stack traces on signal --- base/util.go | 43 ++++++++++++++++++++ rest/adminapitest/admin_api_test.go | 11 ++++++ rest/api.go | 8 ++++ rest/routing.go | 2 + rest/server_context.go | 61 +++++++++++++++++++++++++++++ rest/server_context_test.go | 28 +++++++++++++ rest/stats_context.go | 23 +++-------- tools-tests/sgcollect_info_test.py | 22 +++++++++++ tools/sgcollect.py | 26 ++++++++++++ 9 files changed, 206 insertions(+), 18 deletions(-) diff --git a/base/util.go b/base/util.go index 1b6daa9553..652b1826b9 100644 --- a/base/util.go +++ b/base/util.go @@ -27,6 +27,8 @@ import ( "net" "net/http" "net/url" + "os" + "path/filepath" "reflect" "regexp" "runtime" @@ -1831,3 +1833,44 @@ func IsRevTreeID(s string) bool { } return false } + +func GetStackTrace() string { + // this nees logging in to warmn if we don;t successfully grab the stack in 10 tries + // mnaybe use some switch retyr logic + buf := make([]byte, 1<<20) + count := 0 + for count < 10 { + n := runtime.Stack(buf, true) + fmt.Println("n", n) + if n < len(buf) { + buf = buf[:n] + break + } + buf = make([]byte, 2*len(buf)) + count++ + } + return string(buf) +} + +func RotateProfilesIfNeeded(filename string) error { + existingFiles, err := filepath.Glob(filename) + if err != nil { + return fmt.Errorf("Error listing existing profiles in %q: %w", filename, err) + } + if len(existingFiles) <= 10 { + return nil + } + slices.Reverse(existingFiles) + var multiErr *MultiError + for _, profile := range existingFiles[10:] { + err = os.Remove(profile) + if err != nil { + multiErr = multiErr.Append(fmt.Errorf("Error removing old profile %q: %w", profile, err)) + } + } + return multiErr.ErrorOrNil() +} + +func CreateFileInLoggingDirectory(filename string) (*os.File, error) { + return os.Create(filename) +} diff --git a/rest/adminapitest/admin_api_test.go b/rest/adminapitest/admin_api_test.go index 89045ba4cb..1c41fe7a1e 100644 --- a/rest/adminapitest/admin_api_test.go +++ b/rest/adminapitest/admin_api_test.go @@ -2269,6 +2269,17 @@ func TestHandleSGCollect(t *testing.T) { rest.RequireStatus(t, resp, http.StatusBadRequest) } +func TestHandleGetStackTrace(t *testing.T) { + rt := rest.NewRestTester(t, nil) + defer rt.Close() + + resp := rt.SendAdminRequest(http.MethodGet, "/_debug/stacktrace", "") + rest.RequireStatus(t, resp, http.StatusOK) + rawResponseStr := resp.Body.String() + assert.Contains(t, rawResponseStr, "goroutine") + assert.Contains(t, rawResponseStr, "handleCollectStackTrace") +} + func TestConfigRedaction(t *testing.T) { base.LongRunningTest(t) diff --git a/rest/api.go b/rest/api.go index 7a589d7de1..0adb2cff3d 100644 --- a/rest/api.go +++ b/rest/api.go @@ -693,6 +693,14 @@ func (h *handler) handleFgprof() error { return stopFn() } +func (h *handler) handleCollectStackTrace() error { + + stackTrace := base.GetStackTrace() + + h.writeText([]byte(stackTrace)) + return nil +} + func (h *handler) handlePprofBlock() error { sec, err := strconv.ParseInt(h.rq.FormValue("seconds"), 10, 64) if sec <= 0 || err != nil { diff --git a/rest/routing.go b/rest/routing.go index 6504ddcc74..2458a18b5a 100644 --- a/rest/routing.go +++ b/rest/routing.go @@ -316,6 +316,8 @@ func CreateAdminRouter(sc *ServerContext) *mux.Router { makeHandlerWithOptions(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handlePprofTrace, handlerOptions{sgcollect: true})).Methods("GET", "POST") r.Handle("/_debug/fgprof", makeHandlerWithOptions(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handleFgprof, handlerOptions{sgcollect: true})).Methods("GET", "POST") + r.Handle("/_debug/stacktrace", + makeHandlerWithOptions(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handleCollectStackTrace, handlerOptions{sgcollect: true})).Methods("GET") r.Handle("/_post_upgrade", makeHandler(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handlePostUpgrade)).Methods("POST") diff --git a/rest/server_context.go b/rest/server_context.go index 96acc45b77..0c8950d902 100644 --- a/rest/server_context.go +++ b/rest/server_context.go @@ -20,12 +20,15 @@ import ( "net" "net/http" "os" + "os/signal" + "path/filepath" "slices" "sort" "strconv" "strings" "sync" "sync/atomic" + "syscall" "time" "github.com/KimMachineGun/automemlimit/memlimit" @@ -59,6 +62,8 @@ var ErrSuspendingDisallowed = errors.New("database does not allow suspending") var allServers = []serverType{publicServer, adminServer, metricsServer, diagnosticServer} +const stackFilePrefix = "sg_stack_trace_" + // serverInfo represents an instance of an HTTP server from sync gateway type serverInfo struct { server *http.Server // server is the HTTP server instance @@ -204,9 +209,33 @@ func NewServerContext(ctx context.Context, config *StartupConfig, persistentConf sc.startStatsLogger(ctx) + sc.registerSignalHandlerForStackTrace(ctx) + return sc } +// registerSignalHandlerForStackTrace will register a signal handler to capture stack traces +// - SIGUSR1 causes Sync Gateway to record a stack trace of all running goroutines. +func (sc *ServerContext) registerSignalHandlerForStackTrace(ctx context.Context) { + signalChannel := make(chan os.Signal, 1) + signal.Notify(signalChannel, syscall.SIGUSR1) + + go func() { + for sig := range signalChannel { + base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig) + switch sig { + case syscall.SIGUSR1: + // stack trace signal received + currentTime := time.Now() + timestamp := currentTime.Format(time.RFC3339) + sc.logStackTraces(ctx, timestamp) + default: + // unhandled signal here + } + } + }() +} + func (sc *ServerContext) WaitForRESTAPIs(ctx context.Context) error { timeout := 30 * time.Second interval := time.Millisecond * 100 @@ -1844,6 +1873,38 @@ func (sc *ServerContext) logStats(ctx context.Context) error { } +func (sc *ServerContext) logStackTraces(ctx context.Context, timestamp string) { + + base.InfofCtx(ctx, base.KeyAll, "Collecting stack trace for all goroutines") + stackTrace := base.GetStackTrace() + + // log to console + _, _ = fmt.Fprintf(os.Stderr, "Stack trace:\n%s\n", stackTrace) + + filename := filepath.Join(sc.Config.Logging.LogFilePath, stackFilePrefix+timestamp+".log") + file, err := base.CreateFileInLoggingDirectory(filename) + defer func() { + err = file.Close() + if err != nil { + base.WarnfCtx(ctx, "Error closing stack trace file %s: %v", filename, err) + } + }() + if err != nil { + base.DebugfCtx(ctx, base.KeyAll, "Error opening stack trace file %s: %v", filename, err) + } + + _, err = file.WriteString(fmt.Sprintf("Stack trace:\n%s\n", stackTrace)) + if err != nil { + base.DebugfCtx(ctx, base.KeyAll, "Error writing stack trace to file %s: %v", filename, err) + } + + rotatePath := filepath.Join(sc.Config.Logging.LogFilePath, stackFilePrefix+"*.log") + err = base.RotateProfilesIfNeeded(rotatePath) + if err != nil { + base.DebugfCtx(ctx, base.KeyAll, "Error rotating stack trace files in path %s: %v", rotatePath, err) + } +} + func (sc *ServerContext) logNetworkInterfaceStats(ctx context.Context) { if err := sc.statsContext.addPublicNetworkInterfaceStatsForHostnamePort(sc.Config.API.PublicInterface); err != nil { diff --git a/rest/server_context_test.go b/rest/server_context_test.go index 9ad335e70a..54cf3ccd6b 100644 --- a/rest/server_context_test.go +++ b/rest/server_context_test.go @@ -16,6 +16,7 @@ import ( "net/http" "os" "path/filepath" + "slices" "strconv" "strings" "sync" @@ -1062,3 +1063,30 @@ func TestDatabaseCollectionDeletedErrorState(t *testing.T) { allDbs = rt.ServerContext().allDatabaseSummaries() require.Len(t, allDbs, 1) } + +func TestCollectStackTraceFile(t *testing.T) { + base.SetUpTestLogging(t, base.LevelInfo, base.KeyAll) + + tempPath := t.TempDir() + serverConfig := DefaultStartupConfig(tempPath) + serverConfig.Logging.LogFilePath = tempPath + ctx := t.Context() + serverContext := NewServerContext(ctx, &serverConfig, false) + defer serverContext.Close(ctx) + + timeStamp := "01" + serverContext.logStackTraces(ctx, timeStamp) + require.Len(t, getFilenames(t, tempPath), 1) + slices.Contains(getFilenames(t, tempPath), stackFilePrefix+timeStamp+".log") + + // trigger rotation and assert we don't go above 10 files + expectedFiles := make([]string, 0, 10) + for i := 2; i < 12; i++ { + timeStamp = fmt.Sprintf("%d", i+2) + serverContext.logStackTraces(ctx, timeStamp) + expectedFiles = append(expectedFiles, stackFilePrefix+timeStamp+".log") + } + files := getFilenames(t, tempPath) + require.Len(t, files, 10) + require.ElementsMatch(t, files, expectedFiles) +} diff --git a/rest/stats_context.go b/rest/stats_context.go index 7d48d8d0cf..ea8718974d 100644 --- a/rest/stats_context.go +++ b/rest/stats_context.go @@ -18,7 +18,6 @@ import ( "path/filepath" "runtime" "runtime/pprof" - "slices" "sync/atomic" "time" @@ -393,7 +392,7 @@ func (statsContext *statsContext) collectMemoryProfile(ctx context.Context, outp memoryProfile := pprof.Lookup("heap") filename := filepath.Join(outputDir, pprofPrefix+timestamp+".pb.gz") - file, err := os.Create(filename) + file, err := base.CreateFileInLoggingDirectory(filename) defer func() { err = file.Close() if err != nil { @@ -407,20 +406,8 @@ func (statsContext *statsContext) collectMemoryProfile(ctx context.Context, outp if err != nil { return fmt.Errorf("Error writing memory profile to %q: %w", filename, err) } - existingProfiles, err := filepath.Glob(filepath.Join(outputDir, pprofPrefix+"*.pb.gz")) - if err != nil { - return fmt.Errorf("Error listing existing memory profiles in %q: %w", outputDir, err) - } - if len(existingProfiles) <= 10 { - return nil - } - slices.Reverse(existingProfiles) - var multiErr *base.MultiError - for _, profile := range existingProfiles[10:] { - err = os.Remove(profile) - if err != nil { - multiErr = multiErr.Append(fmt.Errorf("Error removing old memory profile %q: %w", profile, err)) - } - } - return multiErr.ErrorOrNil() + + // rotate old profiles + path := filepath.Join(outputDir, pprofPrefix+"*.pb.gz") + return base.RotateProfilesIfNeeded(path) } diff --git a/tools-tests/sgcollect_info_test.py b/tools-tests/sgcollect_info_test.py index fb5195f9b9..2ee0b18eb2 100644 --- a/tools-tests/sgcollect_info_test.py +++ b/tools-tests/sgcollect_info_test.py @@ -72,6 +72,28 @@ def test_make_collect_logs_heap_profile(tmpdir): assert tasks[0].description.startswith("Contents of") +def test_make_collect_logs_stacktrace(tmpdir): + with unittest.mock.patch( + "sgcollect.urlopen", + return_value=io.BytesIO( + '{{"logfilepath": "{logpath}"}}'.format( + logpath=normalize_path_for_json(tmpdir), + ).encode("utf-8") + ), + ): + stacktrace_file = tmpdir.join("sg_stack_trace.log") + stacktrace_file.write("foo") + tasks = sgcollect.make_collect_logs_tasks( + sg_url="fakeurl", + sg_config_file_path="", + auth_headers={}, + ) + assert [tasks[0].log_file] == [stacktrace_file.basename] + # ensure that this is not redacted task + assert tasks[0].description.startswith("Contents of") + + + @pytest.mark.parametrize("should_redact", [True, False]) def test_make_collect_logs_tasks_duplicate_files(should_redact, tmp_path): tmpdir1 = tmp_path / "tmpdir1" diff --git a/tools/sgcollect.py b/tools/sgcollect.py index 63e7e30a25..fc0a7bbdeb 100755 --- a/tools/sgcollect.py +++ b/tools/sgcollect.py @@ -278,6 +278,26 @@ def make_http_client_pprof_tasks( return pprof_tasks +def make_http_client_stack_trace_task( + sg_url: str, auth_headers: dict[str, str] +) -> PythonTask: + """ + This task uses the python http client to collect the raw stack trace data + """ + stack_trace_url = "{0}/_debug/stacktrace".format(sg_url) + + stack_trace_task = make_curl_task( + name="Collect stack trace via http client", + auth_headers=auth_headers, + url=stack_trace_url, + log_file="sg_stack_trace.log", + ) + stack_trace_task.no_header = True + + return stack_trace_task + + + def to_lower_case_keys_dict(original_dict): result = {} @@ -367,6 +387,7 @@ def make_collect_logs_tasks( "sg_debug.log": "sg_debug.log", "sg_trace.log": "sg_trace.log", "sg_stats.log": "sg_stats.log", + #"sg_stack_trace.log": "sg_stack_trace.log", "sync_gateway_access.log": "sync_gateway_access.log", "sync_gateway_error.log": "sync_gateway_error.log", "pprof.pb": "pprof.pb", @@ -761,6 +782,10 @@ def make_sg_tasks( http_client_pprof_tasks = make_http_client_pprof_tasks(sg_url, auth_headers) + http_client_stack_trace_task = make_http_client_stack_trace_task( + sg_url, auth_headers + ) + # Add a task to collect Sync Gateway config config_tasks = make_config_tasks( sg_config_path, sg_url, auth_headers, should_redact @@ -781,6 +806,7 @@ def make_sg_tasks( collect_logs_tasks, py_expvar_task, http_client_pprof_tasks, + http_client_stack_trace_task, config_tasks, status_tasks, ] From 1beaae4233242d940a790c2fd88313cf8237ce98 Mon Sep 17 00:00:00 2001 From: Gregory Newman-Smith Date: Mon, 10 Nov 2025 09:28:16 +0000 Subject: [PATCH 02/13] remove comment --- tools/sgcollect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/sgcollect.py b/tools/sgcollect.py index fc0a7bbdeb..7a532a8205 100755 --- a/tools/sgcollect.py +++ b/tools/sgcollect.py @@ -387,7 +387,7 @@ def make_collect_logs_tasks( "sg_debug.log": "sg_debug.log", "sg_trace.log": "sg_trace.log", "sg_stats.log": "sg_stats.log", - #"sg_stack_trace.log": "sg_stack_trace.log", + "sg_stack_trace.log": "sg_stack_trace.log", "sync_gateway_access.log": "sync_gateway_access.log", "sync_gateway_error.log": "sync_gateway_error.log", "pprof.pb": "pprof.pb", From df92f5171b74769d3d89398bce76eb1375234c8f Mon Sep 17 00:00:00 2001 From: Gregory Newman-Smith Date: Mon, 10 Nov 2025 11:22:05 +0000 Subject: [PATCH 03/13] fix formatting --- tools-tests/sgcollect_info_test.py | 13 ++++++------- tools/sgcollect.py | 3 +-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/tools-tests/sgcollect_info_test.py b/tools-tests/sgcollect_info_test.py index 2ee0b18eb2..6a5b2503e0 100644 --- a/tools-tests/sgcollect_info_test.py +++ b/tools-tests/sgcollect_info_test.py @@ -74,12 +74,12 @@ def test_make_collect_logs_heap_profile(tmpdir): def test_make_collect_logs_stacktrace(tmpdir): with unittest.mock.patch( - "sgcollect.urlopen", - return_value=io.BytesIO( - '{{"logfilepath": "{logpath}"}}'.format( - logpath=normalize_path_for_json(tmpdir), - ).encode("utf-8") - ), + "sgcollect.urlopen", + return_value=io.BytesIO( + '{{"logfilepath": "{logpath}"}}'.format( + logpath=normalize_path_for_json(tmpdir), + ).encode("utf-8") + ), ): stacktrace_file = tmpdir.join("sg_stack_trace.log") stacktrace_file.write("foo") @@ -93,7 +93,6 @@ def test_make_collect_logs_stacktrace(tmpdir): assert tasks[0].description.startswith("Contents of") - @pytest.mark.parametrize("should_redact", [True, False]) def test_make_collect_logs_tasks_duplicate_files(should_redact, tmp_path): tmpdir1 = tmp_path / "tmpdir1" diff --git a/tools/sgcollect.py b/tools/sgcollect.py index 7a532a8205..649bdb39ed 100755 --- a/tools/sgcollect.py +++ b/tools/sgcollect.py @@ -278,6 +278,7 @@ def make_http_client_pprof_tasks( return pprof_tasks + def make_http_client_stack_trace_task( sg_url: str, auth_headers: dict[str, str] ) -> PythonTask: @@ -297,8 +298,6 @@ def make_http_client_stack_trace_task( return stack_trace_task - - def to_lower_case_keys_dict(original_dict): result = {} for k, v in list(original_dict.items()): From 033a788a5bc19669ea4842983544c33fbb37e424 Mon Sep 17 00:00:00 2001 From: Gregory Newman-Smith Date: Mon, 10 Nov 2025 11:33:28 +0000 Subject: [PATCH 04/13] some minor changes --- base/util.go | 15 ++++++++------- rest/server_context.go | 2 +- rest/stats_context.go | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/base/util.go b/base/util.go index 652b1826b9..fe4eefe801 100644 --- a/base/util.go +++ b/base/util.go @@ -1834,24 +1834,24 @@ func IsRevTreeID(s string) bool { return false } +// GetStackTrace will return goroutine stack traces for all goroutines in Sync Gateway. func GetStackTrace() string { - // this nees logging in to warmn if we don;t successfully grab the stack in 10 tries - // mnaybe use some switch retyr logic + // make 1MB buffer but if this buffer isn't big enough, runtime.Stack will + // return nothing, thus have 5 retires doubling the capacity each time. buf := make([]byte, 1<<20) - count := 0 - for count < 10 { + for range 5 { n := runtime.Stack(buf, true) - fmt.Println("n", n) if n < len(buf) { buf = buf[:n] break } buf = make([]byte, 2*len(buf)) - count++ } return string(buf) } +// RotateProfilesIfNeeded will remove old files if there are more than +// 10 matching the given filename pattern. func RotateProfilesIfNeeded(filename string) error { existingFiles, err := filepath.Glob(filename) if err != nil { @@ -1871,6 +1871,7 @@ func RotateProfilesIfNeeded(filename string) error { return multiErr.ErrorOrNil() } -func CreateFileInLoggingDirectory(filename string) (*os.File, error) { +// CreateFileInDirectory will create a file in directory specified by filename. +func CreateFileInDirectory(filename string) (*os.File, error) { return os.Create(filename) } diff --git a/rest/server_context.go b/rest/server_context.go index 0c8950d902..5229b8ae07 100644 --- a/rest/server_context.go +++ b/rest/server_context.go @@ -1882,7 +1882,7 @@ func (sc *ServerContext) logStackTraces(ctx context.Context, timestamp string) { _, _ = fmt.Fprintf(os.Stderr, "Stack trace:\n%s\n", stackTrace) filename := filepath.Join(sc.Config.Logging.LogFilePath, stackFilePrefix+timestamp+".log") - file, err := base.CreateFileInLoggingDirectory(filename) + file, err := base.CreateFileInDirectory(filename) defer func() { err = file.Close() if err != nil { diff --git a/rest/stats_context.go b/rest/stats_context.go index ea8718974d..2818959e26 100644 --- a/rest/stats_context.go +++ b/rest/stats_context.go @@ -392,7 +392,7 @@ func (statsContext *statsContext) collectMemoryProfile(ctx context.Context, outp memoryProfile := pprof.Lookup("heap") filename := filepath.Join(outputDir, pprofPrefix+timestamp+".pb.gz") - file, err := base.CreateFileInLoggingDirectory(filename) + file, err := base.CreateFileInDirectory(filename) defer func() { err = file.Close() if err != nil { From 2e417035f31d85b48393a9f9dda8bfdd3b6e395f Mon Sep 17 00:00:00 2001 From: Gregory Newman-Smith Date: Mon, 10 Nov 2025 12:17:30 +0000 Subject: [PATCH 05/13] fix leaking goroutine --- base/util.go | 5 ----- rest/server_context.go | 32 +++++++++++++++++++++++--------- rest/server_context_test.go | 2 +- rest/stats_context.go | 2 +- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/base/util.go b/base/util.go index fe4eefe801..49b5d7b944 100644 --- a/base/util.go +++ b/base/util.go @@ -1870,8 +1870,3 @@ func RotateProfilesIfNeeded(filename string) error { } return multiErr.ErrorOrNil() } - -// CreateFileInDirectory will create a file in directory specified by filename. -func CreateFileInDirectory(filename string) (*os.File, error) { - return os.Create(filename) -} diff --git a/rest/server_context.go b/rest/server_context.go index 5229b8ae07..cf882c3575 100644 --- a/rest/server_context.go +++ b/rest/server_context.go @@ -98,6 +98,7 @@ type ServerContext struct { DatabaseInitManager *DatabaseInitManager // Manages database initialization (index creation and readiness) independent of database stop/start/reload, when using persistent config ActiveReplicationsCounter invalidDatabaseConfigTracking invalidDatabaseConfigs + signalContextFunc context.CancelFunc // handle sgcollect processes for a given Server SGCollect *sgCollect } @@ -209,7 +210,9 @@ func NewServerContext(ctx context.Context, config *StartupConfig, persistentConf sc.startStatsLogger(ctx) - sc.registerSignalHandlerForStackTrace(ctx) + signalCtx, cancelFunc := context.WithCancel(ctx) + sc.signalContextFunc = cancelFunc + sc.registerSignalHandlerForStackTrace(signalCtx) return sc } @@ -220,8 +223,14 @@ func (sc *ServerContext) registerSignalHandlerForStackTrace(ctx context.Context) signalChannel := make(chan os.Signal, 1) signal.Notify(signalChannel, syscall.SIGUSR1) + defer func() { + signal.Stop(signalChannel) + close(signalChannel) + }() + go func() { - for sig := range signalChannel { + select { + case sig := <-signalChannel: base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig) switch sig { case syscall.SIGUSR1: @@ -232,6 +241,8 @@ func (sc *ServerContext) registerSignalHandlerForStackTrace(ctx context.Context) default: // unhandled signal here } + case <-ctx.Done(): + return } }() } @@ -304,6 +315,9 @@ func (sc *ServerContext) Close(ctx context.Context) { base.InfofCtx(ctx, base.KeyAll, "Couldn't stop background config update worker: %v", err) } + // cancel any signal handlers + sc.signalContextFunc() + sc.lock.Lock() defer sc.lock.Unlock() @@ -1882,26 +1896,26 @@ func (sc *ServerContext) logStackTraces(ctx context.Context, timestamp string) { _, _ = fmt.Fprintf(os.Stderr, "Stack trace:\n%s\n", stackTrace) filename := filepath.Join(sc.Config.Logging.LogFilePath, stackFilePrefix+timestamp+".log") - file, err := base.CreateFileInDirectory(filename) + file, err := os.Create(filename) defer func() { - err = file.Close() - if err != nil { - base.WarnfCtx(ctx, "Error closing stack trace file %s: %v", filename, err) + closeErr := file.Close() + if closeErr != nil { + base.WarnfCtx(ctx, "Error closing stack trace file %s: %v", filename, closeErr) } }() if err != nil { - base.DebugfCtx(ctx, base.KeyAll, "Error opening stack trace file %s: %v", filename, err) + base.WarnfCtx(ctx, "Error opening stack trace file %s: %v", filename, err) } _, err = file.WriteString(fmt.Sprintf("Stack trace:\n%s\n", stackTrace)) if err != nil { - base.DebugfCtx(ctx, base.KeyAll, "Error writing stack trace to file %s: %v", filename, err) + base.WarnfCtx(ctx, "Error writing stack trace to file %s: %v", filename, err) } rotatePath := filepath.Join(sc.Config.Logging.LogFilePath, stackFilePrefix+"*.log") err = base.RotateProfilesIfNeeded(rotatePath) if err != nil { - base.DebugfCtx(ctx, base.KeyAll, "Error rotating stack trace files in path %s: %v", rotatePath, err) + base.WarnfCtx(ctx, "Error rotating stack trace files in path %s: %v", rotatePath, err) } } diff --git a/rest/server_context_test.go b/rest/server_context_test.go index 54cf3ccd6b..4e3ae1fef1 100644 --- a/rest/server_context_test.go +++ b/rest/server_context_test.go @@ -1077,7 +1077,7 @@ func TestCollectStackTraceFile(t *testing.T) { timeStamp := "01" serverContext.logStackTraces(ctx, timeStamp) require.Len(t, getFilenames(t, tempPath), 1) - slices.Contains(getFilenames(t, tempPath), stackFilePrefix+timeStamp+".log") + assert.True(t, slices.Contains(getFilenames(t, tempPath), stackFilePrefix+timeStamp+".log")) // trigger rotation and assert we don't go above 10 files expectedFiles := make([]string, 0, 10) diff --git a/rest/stats_context.go b/rest/stats_context.go index 2818959e26..4ec835b7f4 100644 --- a/rest/stats_context.go +++ b/rest/stats_context.go @@ -392,7 +392,7 @@ func (statsContext *statsContext) collectMemoryProfile(ctx context.Context, outp memoryProfile := pprof.Lookup("heap") filename := filepath.Join(outputDir, pprofPrefix+timestamp+".pb.gz") - file, err := base.CreateFileInDirectory(filename) + file, err := os.Create(filename) defer func() { err = file.Close() if err != nil { From 0ec5e515bd5a00bda3a4530fe36416d6ff399d7d Mon Sep 17 00:00:00 2001 From: Gregory Newman-Smith Date: Mon, 10 Nov 2025 12:28:46 +0000 Subject: [PATCH 06/13] add api docs --- docs/api/admin.yaml | 2 ++ docs/api/paths/admin/_debug-stack_trace.yaml | 24 ++++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 docs/api/paths/admin/_debug-stack_trace.yaml diff --git a/docs/api/admin.yaml b/docs/api/admin.yaml index 6d76d942ba..56fc71626d 100644 --- a/docs/api/admin.yaml +++ b/docs/api/admin.yaml @@ -97,6 +97,8 @@ paths: $ref: ./paths/admin/_debug-pprof-trace.yaml /_debug/fgprof: $ref: ./paths/admin/_debug-fgprof.yaml + /_debug/stacktrace: + $ref: ./paths/admin/_debug-stack_trace.yaml /_post_upgrade: $ref: ./paths/admin/_post_upgrade.yaml '/{db}/_config': diff --git a/docs/api/paths/admin/_debug-stack_trace.yaml b/docs/api/paths/admin/_debug-stack_trace.yaml new file mode 100644 index 0000000000..1a6df60177 --- /dev/null +++ b/docs/api/paths/admin/_debug-stack_trace.yaml @@ -0,0 +1,24 @@ +# Copyright 2025-Present Couchbase, Inc. +# +# Use of this software is governed by the Business Source License included +# in the file licenses/BSL-Couchbase.txt. As of the Change Date specified +# in that file, in accordance with the Business Source License, use of this +# software will be governed by the Apache License, Version 2.0, included in +# the file licenses/APL2.txt. +get: + summary: Get stack trace for all goroutines + description: |- + Returns stack traces of all running goroutines in Sync Gateway. + + Required Sync Gateway RBAC roles: + + * Sync Gateway Dev Ops + responses: + '200': + description: OK + content: + application/json: + schema: + type: string + tags: + - Profiling From ac7795c18c71464ed7d517cdcf3f5eb71cec1307 Mon Sep 17 00:00:00 2001 From: Gregory Newman-Smith Date: Mon, 10 Nov 2025 12:36:36 +0000 Subject: [PATCH 07/13] windows incompatability + api docs change --- docs/api/paths/admin/_debug-stack_trace.yaml | 1 + rest/server_context.go | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/api/paths/admin/_debug-stack_trace.yaml b/docs/api/paths/admin/_debug-stack_trace.yaml index 1a6df60177..e6591b841b 100644 --- a/docs/api/paths/admin/_debug-stack_trace.yaml +++ b/docs/api/paths/admin/_debug-stack_trace.yaml @@ -22,3 +22,4 @@ get: type: string tags: - Profiling + operationId: get__debug-stacktrace diff --git a/rest/server_context.go b/rest/server_context.go index cf882c3575..594b428131 100644 --- a/rest/server_context.go +++ b/rest/server_context.go @@ -22,6 +22,7 @@ import ( "os" "os/signal" "path/filepath" + "runtime" "slices" "sort" "strconv" @@ -210,9 +211,11 @@ func NewServerContext(ctx context.Context, config *StartupConfig, persistentConf sc.startStatsLogger(ctx) - signalCtx, cancelFunc := context.WithCancel(ctx) - sc.signalContextFunc = cancelFunc - sc.registerSignalHandlerForStackTrace(signalCtx) + if runtime.GOOS != "windows" { + signalCtx, cancelFunc := context.WithCancel(ctx) + sc.signalContextFunc = cancelFunc + sc.registerSignalHandlerForStackTrace(signalCtx) + } return sc } From 201310afdbbafb68cbb18b5ab3a8e68047bf6566 Mon Sep 17 00:00:00 2001 From: Gregory Newman-Smith Date: Mon, 10 Nov 2025 13:16:47 +0000 Subject: [PATCH 08/13] build issues --- rest/server_context.go | 30 ---------------- rest/stack_trace_handler_uinx.go | 54 +++++++++++++++++++++++++++++ rest/stack_trace_handler_windows.go | 44 +++++++++++++++++++++++ 3 files changed, 98 insertions(+), 30 deletions(-) create mode 100644 rest/stack_trace_handler_uinx.go create mode 100644 rest/stack_trace_handler_windows.go diff --git a/rest/server_context.go b/rest/server_context.go index 594b428131..12579dcb8d 100644 --- a/rest/server_context.go +++ b/rest/server_context.go @@ -220,36 +220,6 @@ func NewServerContext(ctx context.Context, config *StartupConfig, persistentConf return sc } -// registerSignalHandlerForStackTrace will register a signal handler to capture stack traces -// - SIGUSR1 causes Sync Gateway to record a stack trace of all running goroutines. -func (sc *ServerContext) registerSignalHandlerForStackTrace(ctx context.Context) { - signalChannel := make(chan os.Signal, 1) - signal.Notify(signalChannel, syscall.SIGUSR1) - - defer func() { - signal.Stop(signalChannel) - close(signalChannel) - }() - - go func() { - select { - case sig := <-signalChannel: - base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig) - switch sig { - case syscall.SIGUSR1: - // stack trace signal received - currentTime := time.Now() - timestamp := currentTime.Format(time.RFC3339) - sc.logStackTraces(ctx, timestamp) - default: - // unhandled signal here - } - case <-ctx.Done(): - return - } - }() -} - func (sc *ServerContext) WaitForRESTAPIs(ctx context.Context) error { timeout := 30 * time.Second interval := time.Millisecond * 100 diff --git a/rest/stack_trace_handler_uinx.go b/rest/stack_trace_handler_uinx.go new file mode 100644 index 0000000000..e15421d8d8 --- /dev/null +++ b/rest/stack_trace_handler_uinx.go @@ -0,0 +1,54 @@ +//go:build !windows +// +build !windows + +/* +Copyright 2025-Present Couchbase, Inc. + +Use of this software is governed by the Business Source License included in +the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that +file, in accordance with the Business Source License, use of this software will +be governed by the Apache License, Version 2.0, included in the file +licenses/APL2.txt. +*/ + +package rest + +import ( + "context" + "os" + "os/signal" + "syscall" + "time" + + "github.com/couchbase/sync_gateway/base" +) + +// registerSignalHandlerForStackTrace will register a signal handler to capture stack traces +// - SIGUSR1 causes Sync Gateway to record a stack trace of all running goroutines. +func (sc *ServerContext) registerSignalHandlerForStackTrace(ctx context.Context) { + signalChannel := make(chan os.Signal, 1) + signal.Notify(signalChannel, syscall.SIGUSR1) + + defer func() { + signal.Stop(signalChannel) + close(signalChannel) + }() + + go func() { + select { + case sig := <-signalChannel: + base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig) + switch sig { + case syscall.SIGUSR1: + // stack trace signal received + currentTime := time.Now() + timestamp := currentTime.Format(time.RFC3339) + sc.logStackTraces(ctx, timestamp) + default: + // unhandled signal here + } + case <-ctx.Done(): + return + } + }() +} diff --git a/rest/stack_trace_handler_windows.go b/rest/stack_trace_handler_windows.go new file mode 100644 index 0000000000..2db6bfbfe6 --- /dev/null +++ b/rest/stack_trace_handler_windows.go @@ -0,0 +1,44 @@ +//go:build windows +// +build windows + +/* +Copyright 2025-Present Couchbase, Inc. + +Use of this software is governed by the Business Source License included in +the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that +file, in accordance with the Business Source License, use of this software will +be governed by the Apache License, Version 2.0, included in the file +licenses/APL2.txt. +*/ + +package rest + +// registerSignalHandlerForStackTrace will register a signal handler to capture stack traces +// - SIGUSR1 causes Sync Gateway to record a stack trace of all running goroutines. +func (sc *ServerContext) registerSignalHandlerForStackTrace(ctx context.Context) { + signalChannel := make(chan os.Signal, 1) + signal.Notify(signalChannel, syscall.SIGUSR1) + + defer func() { + signal.Stop(signalChannel) + close(signalChannel) + }() + + go func() { + select { + case sig := <-signalChannel: + base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig) + switch sig { + case syscall.SIGUSR1: + // stack trace signal received + currentTime := time.Now() + timestamp := currentTime.Format(time.RFC3339) + sc.logStackTraces(ctx, timestamp) + default: + // unhandled signal here + } + case <-ctx.Done(): + return + } + }() +} From 8207199d5c636a1541f46a5cf1df4e47cec67b94 Mon Sep 17 00:00:00 2001 From: Gregory Newman-Smith Date: Mon, 10 Nov 2025 13:19:16 +0000 Subject: [PATCH 09/13] foratting + mistake with windows handler --- rest/server_context.go | 2 -- rest/stack_trace_handler_windows.go | 26 +------------------------- 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/rest/server_context.go b/rest/server_context.go index 12579dcb8d..d8d98bedfc 100644 --- a/rest/server_context.go +++ b/rest/server_context.go @@ -20,7 +20,6 @@ import ( "net" "net/http" "os" - "os/signal" "path/filepath" "runtime" "slices" @@ -29,7 +28,6 @@ import ( "strings" "sync" "sync/atomic" - "syscall" "time" "github.com/KimMachineGun/automemlimit/memlimit" diff --git a/rest/stack_trace_handler_windows.go b/rest/stack_trace_handler_windows.go index 2db6bfbfe6..60358946e3 100644 --- a/rest/stack_trace_handler_windows.go +++ b/rest/stack_trace_handler_windows.go @@ -16,29 +16,5 @@ package rest // registerSignalHandlerForStackTrace will register a signal handler to capture stack traces // - SIGUSR1 causes Sync Gateway to record a stack trace of all running goroutines. func (sc *ServerContext) registerSignalHandlerForStackTrace(ctx context.Context) { - signalChannel := make(chan os.Signal, 1) - signal.Notify(signalChannel, syscall.SIGUSR1) - - defer func() { - signal.Stop(signalChannel) - close(signalChannel) - }() - - go func() { - select { - case sig := <-signalChannel: - base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig) - switch sig { - case syscall.SIGUSR1: - // stack trace signal received - currentTime := time.Now() - timestamp := currentTime.Format(time.RFC3339) - sc.logStackTraces(ctx, timestamp) - default: - // unhandled signal here - } - case <-ctx.Done(): - return - } - }() + // No-op on Windows } From e54a721c7374ba5d7bac72d11f2385d94aaed8b7 Mon Sep 17 00:00:00 2001 From: Gregory Newman-Smith Date: Tue, 11 Nov 2025 10:44:42 +0000 Subject: [PATCH 10/13] updates based off review --- base/constants.go | 3 + base/util.go | 47 ++++++++++----- docs/api/admin.yaml | 2 - docs/api/paths/admin/_debug-stack_trace.yaml | 25 -------- rest/adminapitest/admin_api_test.go | 4 +- rest/api.go | 8 --- rest/config.go | 25 -------- rest/main.go | 10 +++- rest/register_handler_unix.go | 60 ++++++++++++++++++++ rest/register_handler_windows.go | 47 +++++++++++++++ rest/routing.go | 2 - rest/server_context.go | 46 --------------- rest/server_context_test.go | 12 ++-- rest/stack_trace_handler_uinx.go | 54 ------------------ rest/stack_trace_handler_windows.go | 20 ------- tools/sgcollect.py | 2 +- 16 files changed, 164 insertions(+), 203 deletions(-) delete mode 100644 docs/api/paths/admin/_debug-stack_trace.yaml create mode 100644 rest/register_handler_unix.go create mode 100644 rest/register_handler_windows.go delete mode 100644 rest/stack_trace_handler_uinx.go delete mode 100644 rest/stack_trace_handler_windows.go diff --git a/base/constants.go b/base/constants.go index 324669b67d..22af3b5750 100644 --- a/base/constants.go +++ b/base/constants.go @@ -155,6 +155,9 @@ const ( // FromConnStrWarningThreshold determines the amount of time it should take before we warn about parsing a connstr (mostly for DNS resolution) FromConnStrWarningThreshold = 10 * time.Second + + // StackFilePrefix is the prefix used when writing stack trace files + StackFilePrefix = "sg_stack_trace_" ) // SyncGatewayRawDocXattrs is a list of xattrs that Sync Gateway will fetch when reading a raw document. diff --git a/base/util.go b/base/util.go index 49b5d7b944..74ceb56547 100644 --- a/base/util.go +++ b/base/util.go @@ -33,6 +33,7 @@ import ( "regexp" "runtime" "runtime/debug" + "runtime/pprof" "slices" "sort" "strconv" @@ -1835,19 +1836,10 @@ func IsRevTreeID(s string) bool { } // GetStackTrace will return goroutine stack traces for all goroutines in Sync Gateway. -func GetStackTrace() string { - // make 1MB buffer but if this buffer isn't big enough, runtime.Stack will - // return nothing, thus have 5 retires doubling the capacity each time. - buf := make([]byte, 1<<20) - for range 5 { - n := runtime.Stack(buf, true) - if n < len(buf) { - buf = buf[:n] - break - } - buf = make([]byte, 2*len(buf)) - } - return string(buf) +func GetStackTrace() (bytes.Buffer, error) { + profBuf := bytes.Buffer{} + err := pprof.Lookup("goroutine").WriteTo(&profBuf, 2) + return profBuf, err } // RotateProfilesIfNeeded will remove old files if there are more than @@ -1870,3 +1862,32 @@ func RotateProfilesIfNeeded(filename string) error { } return multiErr.ErrorOrNil() } + +func LogStackTraces(ctx context.Context, logDirectory string, stackTrace bytes.Buffer, timestamp string) { + + // log to console + _, _ = fmt.Fprintf(os.Stderr, "Stack trace:\n%s\n", stackTrace.String()) + + filename := filepath.Join(logDirectory, StackFilePrefix+timestamp+".log") + file, err := os.Create(filename) + defer func() { + closeErr := file.Close() + if closeErr != nil { + WarnfCtx(ctx, "Error closing stack trace file %s: %v", filename, closeErr) + } + }() + if err != nil { + WarnfCtx(ctx, "Error opening stack trace file %s: %v", filename, err) + } + + _, err = file.WriteString(fmt.Sprintf("Stack trace:\n%s\n", stackTrace.String())) + if err != nil { + WarnfCtx(ctx, "Error writing stack trace to file %s: %v", filename, err) + } + + rotatePath := filepath.Join(logDirectory, StackFilePrefix+"*.log") + err = RotateProfilesIfNeeded(rotatePath) + if err != nil { + WarnfCtx(ctx, "Error rotating stack trace files in path %s: %v", rotatePath, err) + } +} diff --git a/docs/api/admin.yaml b/docs/api/admin.yaml index 56fc71626d..6d76d942ba 100644 --- a/docs/api/admin.yaml +++ b/docs/api/admin.yaml @@ -97,8 +97,6 @@ paths: $ref: ./paths/admin/_debug-pprof-trace.yaml /_debug/fgprof: $ref: ./paths/admin/_debug-fgprof.yaml - /_debug/stacktrace: - $ref: ./paths/admin/_debug-stack_trace.yaml /_post_upgrade: $ref: ./paths/admin/_post_upgrade.yaml '/{db}/_config': diff --git a/docs/api/paths/admin/_debug-stack_trace.yaml b/docs/api/paths/admin/_debug-stack_trace.yaml deleted file mode 100644 index e6591b841b..0000000000 --- a/docs/api/paths/admin/_debug-stack_trace.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2025-Present Couchbase, Inc. -# -# Use of this software is governed by the Business Source License included -# in the file licenses/BSL-Couchbase.txt. As of the Change Date specified -# in that file, in accordance with the Business Source License, use of this -# software will be governed by the Apache License, Version 2.0, included in -# the file licenses/APL2.txt. -get: - summary: Get stack trace for all goroutines - description: |- - Returns stack traces of all running goroutines in Sync Gateway. - - Required Sync Gateway RBAC roles: - - * Sync Gateway Dev Ops - responses: - '200': - description: OK - content: - application/json: - schema: - type: string - tags: - - Profiling - operationId: get__debug-stacktrace diff --git a/rest/adminapitest/admin_api_test.go b/rest/adminapitest/admin_api_test.go index 1c41fe7a1e..0078a826fc 100644 --- a/rest/adminapitest/admin_api_test.go +++ b/rest/adminapitest/admin_api_test.go @@ -2273,11 +2273,11 @@ func TestHandleGetStackTrace(t *testing.T) { rt := rest.NewRestTester(t, nil) defer rt.Close() - resp := rt.SendAdminRequest(http.MethodGet, "/_debug/stacktrace", "") + resp := rt.SendAdminRequest(http.MethodGet, "/_debug/pprof/goroutine?debug=2", "") rest.RequireStatus(t, resp, http.StatusOK) rawResponseStr := resp.Body.String() assert.Contains(t, rawResponseStr, "goroutine") - assert.Contains(t, rawResponseStr, "handleCollectStackTrace") + assert.Contains(t, rawResponseStr, "handlePprofGoroutine") } func TestConfigRedaction(t *testing.T) { diff --git a/rest/api.go b/rest/api.go index 0adb2cff3d..7a589d7de1 100644 --- a/rest/api.go +++ b/rest/api.go @@ -693,14 +693,6 @@ func (h *handler) handleFgprof() error { return stopFn() } -func (h *handler) handleCollectStackTrace() error { - - stackTrace := base.GetStackTrace() - - h.writeText([]byte(stackTrace)) - return nil -} - func (h *handler) handlePprofBlock() error { sec, err := strconv.ParseInt(h.rq.FormValue("seconds"), 10, 64) if sec <= 0 || err != nil { diff --git a/rest/config.go b/rest/config.go index 92749fc22f..36682771b7 100644 --- a/rest/config.go +++ b/rest/config.go @@ -22,12 +22,10 @@ import ( _ "net/http/pprof" "net/url" "os" - "os/signal" "strconv" "strings" "sync" "sync/atomic" - "syscall" "time" "github.com/go-jose/go-jose/v4" @@ -2315,29 +2313,6 @@ func HandleSighup(ctx context.Context) { base.RotateLogfiles(ctx) } -// RegisterSignalHandler invokes functions based on the given signals: -// - SIGHUP causes Sync Gateway to rotate log files. -// - SIGINT or SIGTERM causes Sync Gateway to exit cleanly. -// - SIGKILL cannot be handled by the application. -func RegisterSignalHandler(ctx context.Context) { - signalChannel := make(chan os.Signal, 1) - signal.Notify(signalChannel, syscall.SIGHUP, os.Interrupt, syscall.SIGTERM) - - go func() { - for sig := range signalChannel { - base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig) - switch sig { - case syscall.SIGHUP: - HandleSighup(ctx) - default: - // Ensure log buffers are flushed before exiting. - base.FlushLogBuffers() - os.Exit(130) // 130 == exit code 128 + 2 (interrupt) - } - } - }() -} - // toDbLogConfig converts the stored logging in a DbConfig to a runtime DbLogConfig for evaluation at log time. // This is required to turn the stored config (which does not have data stored in a O(1)-compatible format) into a data structure that has O(1) lookups for checking if we should log. func (c *DbConfig) toDbLogConfig(ctx context.Context) *base.DbLogConfig { diff --git a/rest/main.go b/rest/main.go index 3fe7f3c812..f79243e06d 100644 --- a/rest/main.go +++ b/rest/main.go @@ -16,7 +16,9 @@ import ( "fmt" "io" "os" + "os/signal" "path/filepath" + "runtime" "strings" "time" @@ -37,7 +39,7 @@ func ServerMain() { // TODO: Pass ctx down into HTTP servers so that serverMain can be stopped. func serverMain(ctx context.Context, osArgs []string) error { - RegisterSignalHandler(ctx) + sigChan := RegisterSignalHandler(ctx, "") defer base.FatalPanicHandler() base.InitializeMemoryLoggers() @@ -51,6 +53,12 @@ func serverMain(ctx context.Context, osArgs []string) error { } return err } + if runtime.GOOS != "windows" { + // stop signal handlers and register for stack trace handling, this is not supported for windows environments + signal.Stop(sigChan) + close(sigChan) + RegisterSignalHandler(ctx, flagStartupConfig.Logging.LogFilePath) + } if *disablePersistentConfig { return legacyServerMain(ctx, osArgs, flagStartupConfig) diff --git a/rest/register_handler_unix.go b/rest/register_handler_unix.go new file mode 100644 index 0000000000..d87fa1f99c --- /dev/null +++ b/rest/register_handler_unix.go @@ -0,0 +1,60 @@ +//go:build !windows +// +build !windows + +/* +Copyright 2025-Present Couchbase, Inc. + +Use of this software is governed by the Business Source License included in +the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that +file, in accordance with the Business Source License, use of this software will +be governed by the Apache License, Version 2.0, included in the file +licenses/APL2.txt. +*/ + +package rest + +import ( + "context" + "os" + "os/signal" + "syscall" + "time" + + "github.com/couchbase/sync_gateway/base" +) + +// RegisterSignalHandler invokes functions based on the given signals for unix environments: +// - SIGHUP causes Sync Gateway to rotate log files. +// - SIGINT or SIGTERM causes Sync Gateway to exit cleanly. +// - SIGKILL cannot be handled by the application. +// - SIGUSR1 causes Sync Gateway to log stack traces for all goroutines. +func RegisterSignalHandler(ctx context.Context, logDirectory string) chan os.Signal { + signalChannel := make(chan os.Signal, 1) + signal.Notify(signalChannel, syscall.SIGHUP, os.Interrupt, syscall.SIGTERM, syscall.SIGUSR1) + + go func() { + for sig := range signalChannel { + base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig) + switch sig { + case syscall.SIGHUP: + HandleSighup(ctx) + case syscall.SIGUSR1: + stackTrace, err := base.GetStackTrace() + if err != nil { + base.WarnfCtx(ctx, "Error collecting stack trace: %v", err) + } else { + base.InfofCtx(ctx, base.KeyAll, "Collecting stack trace for all goroutines") + } + // log to console and log to file in the log directory + currentTime := time.Now() + timestamp := currentTime.Format(time.RFC3339) + base.LogStackTraces(ctx, logDirectory, stackTrace, timestamp) + default: + // Ensure log buffers are flushed before exiting. + base.FlushLogBuffers() + os.Exit(130) // 130 == exit code 128 + 2 (interrupt) + } + } + }() + return signalChannel +} diff --git a/rest/register_handler_windows.go b/rest/register_handler_windows.go new file mode 100644 index 0000000000..9992a59ea0 --- /dev/null +++ b/rest/register_handler_windows.go @@ -0,0 +1,47 @@ +//go:build windows +// +build windows + +/* +Copyright 2025-Present Couchbase, Inc. + +Use of this software is governed by the Business Source License included in +the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that +file, in accordance with the Business Source License, use of this software will +be governed by the Apache License, Version 2.0, included in the file +licenses/APL2.txt. +*/ + +package rest + +import ( + "context" + "os" + "os/signal" + "syscall" + + "github.com/couchbase/sync_gateway/base" +) + +// RegisterSignalHandler invokes functions based on the given signals for windows environments: +// - SIGHUP causes Sync Gateway to rotate log files. +// - SIGINT or SIGTERM causes Sync Gateway to exit cleanly. +// - SIGKILL cannot be handled by the application. +func RegisterSignalHandler(ctx context.Context, logDirectory string) chan os.Signal { + signalChannel := make(chan os.Signal, 1) + signal.Notify(signalChannel, syscall.SIGHUP, os.Interrupt, syscall.SIGTERM) + + go func() { + for sig := range signalChannel { + base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig) + switch sig { + case syscall.SIGHUP: + HandleSighup(ctx) + default: + // Ensure log buffers are flushed before exiting. + base.FlushLogBuffers() + os.Exit(130) // 130 == exit code 128 + 2 (interrupt) + } + } + }() + return signalChannel +} diff --git a/rest/routing.go b/rest/routing.go index 2458a18b5a..6504ddcc74 100644 --- a/rest/routing.go +++ b/rest/routing.go @@ -316,8 +316,6 @@ func CreateAdminRouter(sc *ServerContext) *mux.Router { makeHandlerWithOptions(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handlePprofTrace, handlerOptions{sgcollect: true})).Methods("GET", "POST") r.Handle("/_debug/fgprof", makeHandlerWithOptions(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handleFgprof, handlerOptions{sgcollect: true})).Methods("GET", "POST") - r.Handle("/_debug/stacktrace", - makeHandlerWithOptions(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handleCollectStackTrace, handlerOptions{sgcollect: true})).Methods("GET") r.Handle("/_post_upgrade", makeHandler(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handlePostUpgrade)).Methods("POST") diff --git a/rest/server_context.go b/rest/server_context.go index d8d98bedfc..96acc45b77 100644 --- a/rest/server_context.go +++ b/rest/server_context.go @@ -20,8 +20,6 @@ import ( "net" "net/http" "os" - "path/filepath" - "runtime" "slices" "sort" "strconv" @@ -61,8 +59,6 @@ var ErrSuspendingDisallowed = errors.New("database does not allow suspending") var allServers = []serverType{publicServer, adminServer, metricsServer, diagnosticServer} -const stackFilePrefix = "sg_stack_trace_" - // serverInfo represents an instance of an HTTP server from sync gateway type serverInfo struct { server *http.Server // server is the HTTP server instance @@ -97,7 +93,6 @@ type ServerContext struct { DatabaseInitManager *DatabaseInitManager // Manages database initialization (index creation and readiness) independent of database stop/start/reload, when using persistent config ActiveReplicationsCounter invalidDatabaseConfigTracking invalidDatabaseConfigs - signalContextFunc context.CancelFunc // handle sgcollect processes for a given Server SGCollect *sgCollect } @@ -209,12 +204,6 @@ func NewServerContext(ctx context.Context, config *StartupConfig, persistentConf sc.startStatsLogger(ctx) - if runtime.GOOS != "windows" { - signalCtx, cancelFunc := context.WithCancel(ctx) - sc.signalContextFunc = cancelFunc - sc.registerSignalHandlerForStackTrace(signalCtx) - } - return sc } @@ -286,9 +275,6 @@ func (sc *ServerContext) Close(ctx context.Context) { base.InfofCtx(ctx, base.KeyAll, "Couldn't stop background config update worker: %v", err) } - // cancel any signal handlers - sc.signalContextFunc() - sc.lock.Lock() defer sc.lock.Unlock() @@ -1858,38 +1844,6 @@ func (sc *ServerContext) logStats(ctx context.Context) error { } -func (sc *ServerContext) logStackTraces(ctx context.Context, timestamp string) { - - base.InfofCtx(ctx, base.KeyAll, "Collecting stack trace for all goroutines") - stackTrace := base.GetStackTrace() - - // log to console - _, _ = fmt.Fprintf(os.Stderr, "Stack trace:\n%s\n", stackTrace) - - filename := filepath.Join(sc.Config.Logging.LogFilePath, stackFilePrefix+timestamp+".log") - file, err := os.Create(filename) - defer func() { - closeErr := file.Close() - if closeErr != nil { - base.WarnfCtx(ctx, "Error closing stack trace file %s: %v", filename, closeErr) - } - }() - if err != nil { - base.WarnfCtx(ctx, "Error opening stack trace file %s: %v", filename, err) - } - - _, err = file.WriteString(fmt.Sprintf("Stack trace:\n%s\n", stackTrace)) - if err != nil { - base.WarnfCtx(ctx, "Error writing stack trace to file %s: %v", filename, err) - } - - rotatePath := filepath.Join(sc.Config.Logging.LogFilePath, stackFilePrefix+"*.log") - err = base.RotateProfilesIfNeeded(rotatePath) - if err != nil { - base.WarnfCtx(ctx, "Error rotating stack trace files in path %s: %v", rotatePath, err) - } -} - func (sc *ServerContext) logNetworkInterfaceStats(ctx context.Context) { if err := sc.statsContext.addPublicNetworkInterfaceStatsForHostnamePort(sc.Config.API.PublicInterface); err != nil { diff --git a/rest/server_context_test.go b/rest/server_context_test.go index 4e3ae1fef1..5c78ac614d 100644 --- a/rest/server_context_test.go +++ b/rest/server_context_test.go @@ -1075,16 +1075,20 @@ func TestCollectStackTraceFile(t *testing.T) { defer serverContext.Close(ctx) timeStamp := "01" - serverContext.logStackTraces(ctx, timeStamp) + stackTrace, err := base.GetStackTrace() + require.NoError(t, err) + base.LogStackTraces(ctx, serverConfig.Logging.LogFilePath, stackTrace, timeStamp) require.Len(t, getFilenames(t, tempPath), 1) - assert.True(t, slices.Contains(getFilenames(t, tempPath), stackFilePrefix+timeStamp+".log")) + assert.True(t, slices.Contains(getFilenames(t, tempPath), base.StackFilePrefix+timeStamp+".log")) // trigger rotation and assert we don't go above 10 files expectedFiles := make([]string, 0, 10) for i := 2; i < 12; i++ { timeStamp = fmt.Sprintf("%d", i+2) - serverContext.logStackTraces(ctx, timeStamp) - expectedFiles = append(expectedFiles, stackFilePrefix+timeStamp+".log") + trace, err := base.GetStackTrace() + require.NoError(t, err) + base.LogStackTraces(ctx, serverConfig.Logging.LogFilePath, trace, timeStamp) + expectedFiles = append(expectedFiles, base.StackFilePrefix+timeStamp+".log") } files := getFilenames(t, tempPath) require.Len(t, files, 10) diff --git a/rest/stack_trace_handler_uinx.go b/rest/stack_trace_handler_uinx.go deleted file mode 100644 index e15421d8d8..0000000000 --- a/rest/stack_trace_handler_uinx.go +++ /dev/null @@ -1,54 +0,0 @@ -//go:build !windows -// +build !windows - -/* -Copyright 2025-Present Couchbase, Inc. - -Use of this software is governed by the Business Source License included in -the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that -file, in accordance with the Business Source License, use of this software will -be governed by the Apache License, Version 2.0, included in the file -licenses/APL2.txt. -*/ - -package rest - -import ( - "context" - "os" - "os/signal" - "syscall" - "time" - - "github.com/couchbase/sync_gateway/base" -) - -// registerSignalHandlerForStackTrace will register a signal handler to capture stack traces -// - SIGUSR1 causes Sync Gateway to record a stack trace of all running goroutines. -func (sc *ServerContext) registerSignalHandlerForStackTrace(ctx context.Context) { - signalChannel := make(chan os.Signal, 1) - signal.Notify(signalChannel, syscall.SIGUSR1) - - defer func() { - signal.Stop(signalChannel) - close(signalChannel) - }() - - go func() { - select { - case sig := <-signalChannel: - base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig) - switch sig { - case syscall.SIGUSR1: - // stack trace signal received - currentTime := time.Now() - timestamp := currentTime.Format(time.RFC3339) - sc.logStackTraces(ctx, timestamp) - default: - // unhandled signal here - } - case <-ctx.Done(): - return - } - }() -} diff --git a/rest/stack_trace_handler_windows.go b/rest/stack_trace_handler_windows.go deleted file mode 100644 index 60358946e3..0000000000 --- a/rest/stack_trace_handler_windows.go +++ /dev/null @@ -1,20 +0,0 @@ -//go:build windows -// +build windows - -/* -Copyright 2025-Present Couchbase, Inc. - -Use of this software is governed by the Business Source License included in -the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that -file, in accordance with the Business Source License, use of this software will -be governed by the Apache License, Version 2.0, included in the file -licenses/APL2.txt. -*/ - -package rest - -// registerSignalHandlerForStackTrace will register a signal handler to capture stack traces -// - SIGUSR1 causes Sync Gateway to record a stack trace of all running goroutines. -func (sc *ServerContext) registerSignalHandlerForStackTrace(ctx context.Context) { - // No-op on Windows -} diff --git a/tools/sgcollect.py b/tools/sgcollect.py index 649bdb39ed..d079530917 100755 --- a/tools/sgcollect.py +++ b/tools/sgcollect.py @@ -285,7 +285,7 @@ def make_http_client_stack_trace_task( """ This task uses the python http client to collect the raw stack trace data """ - stack_trace_url = "{0}/_debug/stacktrace".format(sg_url) + stack_trace_url = "{0}/_debug/pprof/goroutine?debug=2".format(sg_url) stack_trace_task = make_curl_task( name="Collect stack trace via http client", From 2c1bfebed9fce334916f8bd22cf5f21b8f8d1008 Mon Sep 17 00:00:00 2001 From: Gregory Newman-Smith Date: Tue, 11 Nov 2025 11:20:25 +0000 Subject: [PATCH 11/13] tweaks for logging directory path --- rest/main.go | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/rest/main.go b/rest/main.go index f79243e06d..d8e0fae77d 100644 --- a/rest/main.go +++ b/rest/main.go @@ -53,18 +53,12 @@ func serverMain(ctx context.Context, osArgs []string) error { } return err } - if runtime.GOOS != "windows" { - // stop signal handlers and register for stack trace handling, this is not supported for windows environments - signal.Stop(sigChan) - close(sigChan) - RegisterSignalHandler(ctx, flagStartupConfig.Logging.LogFilePath) - } if *disablePersistentConfig { return legacyServerMain(ctx, osArgs, flagStartupConfig) } - disablePersistentConfigFallback, err := serverMainPersistentConfig(ctx, fs, flagStartupConfig) + disablePersistentConfigFallback, err := serverMainPersistentConfig(ctx, fs, flagStartupConfig, sigChan) if disablePersistentConfigFallback { return legacyServerMain(ctx, osArgs, flagStartupConfig) } @@ -73,7 +67,7 @@ func serverMain(ctx context.Context, osArgs []string) error { } // serverMainPersistentConfig runs the Sync Gateway server with persistent config. -func serverMainPersistentConfig(ctx context.Context, fs *flag.FlagSet, flagStartupConfig *StartupConfig) (disablePersistentConfigFallback bool, err error) { +func serverMainPersistentConfig(ctx context.Context, fs *flag.FlagSet, flagStartupConfig *StartupConfig, sigChan chan os.Signal) (disablePersistentConfigFallback bool, err error) { sc := DefaultStartupConfig(defaultLogFilePath) base.TracefCtx(ctx, base.KeyAll, "default config: %#v", sc) @@ -156,6 +150,14 @@ func serverMainPersistentConfig(ctx context.Context, fs *flag.FlagSet, flagStart return false, err } + if runtime.GOOS != "windows" { + // stop signal handlers and register for stack trace handling to be able to log to configured directory, + // this is not supported for windows environments + signal.Stop(sigChan) + close(sigChan) + RegisterSignalHandler(ctx, svrctx.Config.Logging.LogFilePath) + } + svrctx.initialStartupConfig = initialStartupConfig svrctx.addLegacyPrincipals(ctx, legacyDbUsers, legacyDbRoles) From 027cd4f95481cd46a16fa121db947bf7657e1213 Mon Sep 17 00:00:00 2001 From: Gregory Newman-Smith Date: Thu, 13 Nov 2025 14:48:37 +0000 Subject: [PATCH 12/13] address review --- base/constants.go | 2 +- base/util.go | 26 ++++++++++++++++++-------- rest/stats_context.go | 2 +- tools-tests/sgcollect_info_test.py | 2 +- tools/sgcollect.py | 4 ++-- 5 files changed, 23 insertions(+), 13 deletions(-) diff --git a/base/constants.go b/base/constants.go index 22af3b5750..f4dda65d83 100644 --- a/base/constants.go +++ b/base/constants.go @@ -157,7 +157,7 @@ const ( FromConnStrWarningThreshold = 10 * time.Second // StackFilePrefix is the prefix used when writing stack trace files - StackFilePrefix = "sg_stack_trace_" + StackFilePrefix = "sg_goroutines_" ) // SyncGatewayRawDocXattrs is a list of xattrs that Sync Gateway will fetch when reading a raw document. diff --git a/base/util.go b/base/util.go index 74ceb56547..e21855e150 100644 --- a/base/util.go +++ b/base/util.go @@ -1842,9 +1842,9 @@ func GetStackTrace() (bytes.Buffer, error) { return profBuf, err } -// RotateProfilesIfNeeded will remove old files if there are more than +// RotateFilenamesIfNeeded will remove old files if there are more than // 10 matching the given filename pattern. -func RotateProfilesIfNeeded(filename string) error { +func RotateFilenamesIfNeeded(filename string) error { existingFiles, err := filepath.Glob(filename) if err != nil { return fmt.Errorf("Error listing existing profiles in %q: %w", filename, err) @@ -1868,6 +1868,19 @@ func LogStackTraces(ctx context.Context, logDirectory string, stackTrace bytes.B // log to console _, _ = fmt.Fprintf(os.Stderr, "Stack trace:\n%s\n", stackTrace.String()) + err := writeStackTraceFile(ctx, logDirectory, timestamp, stackTrace) + if err != nil { + return + } + + rotatePath := filepath.Join(logDirectory, StackFilePrefix+"*.log") + err = RotateFilenamesIfNeeded(rotatePath) + if err != nil { + WarnfCtx(ctx, "Error rotating stack trace files in path %s: %v", rotatePath, err) + } +} + +func writeStackTraceFile(ctx context.Context, logDirectory, timestamp string, stackTrace bytes.Buffer) error { filename := filepath.Join(logDirectory, StackFilePrefix+timestamp+".log") file, err := os.Create(filename) defer func() { @@ -1878,16 +1891,13 @@ func LogStackTraces(ctx context.Context, logDirectory string, stackTrace bytes.B }() if err != nil { WarnfCtx(ctx, "Error opening stack trace file %s: %v", filename, err) + return err } _, err = file.WriteString(fmt.Sprintf("Stack trace:\n%s\n", stackTrace.String())) if err != nil { WarnfCtx(ctx, "Error writing stack trace to file %s: %v", filename, err) + return err } - - rotatePath := filepath.Join(logDirectory, StackFilePrefix+"*.log") - err = RotateProfilesIfNeeded(rotatePath) - if err != nil { - WarnfCtx(ctx, "Error rotating stack trace files in path %s: %v", rotatePath, err) - } + return nil } diff --git a/rest/stats_context.go b/rest/stats_context.go index 4ec835b7f4..6b8e5b65fd 100644 --- a/rest/stats_context.go +++ b/rest/stats_context.go @@ -409,5 +409,5 @@ func (statsContext *statsContext) collectMemoryProfile(ctx context.Context, outp // rotate old profiles path := filepath.Join(outputDir, pprofPrefix+"*.pb.gz") - return base.RotateProfilesIfNeeded(path) + return base.RotateFilenamesIfNeeded(path) } diff --git a/tools-tests/sgcollect_info_test.py b/tools-tests/sgcollect_info_test.py index 6a5b2503e0..11de366f7d 100644 --- a/tools-tests/sgcollect_info_test.py +++ b/tools-tests/sgcollect_info_test.py @@ -81,7 +81,7 @@ def test_make_collect_logs_stacktrace(tmpdir): ).encode("utf-8") ), ): - stacktrace_file = tmpdir.join("sg_stack_trace.log") + stacktrace_file = tmpdir.join("sg_goroutines.log") stacktrace_file.write("foo") tasks = sgcollect.make_collect_logs_tasks( sg_url="fakeurl", diff --git a/tools/sgcollect.py b/tools/sgcollect.py index d079530917..66d92fdd19 100755 --- a/tools/sgcollect.py +++ b/tools/sgcollect.py @@ -291,7 +291,7 @@ def make_http_client_stack_trace_task( name="Collect stack trace via http client", auth_headers=auth_headers, url=stack_trace_url, - log_file="sg_stack_trace.log", + log_file="sg_goroutines.log", ) stack_trace_task.no_header = True @@ -386,7 +386,7 @@ def make_collect_logs_tasks( "sg_debug.log": "sg_debug.log", "sg_trace.log": "sg_trace.log", "sg_stats.log": "sg_stats.log", - "sg_stack_trace.log": "sg_stack_trace.log", + "sg_goroutines.log": "sg_goroutines.log", "sync_gateway_access.log": "sync_gateway_access.log", "sync_gateway_error.log": "sync_gateway_error.log", "pprof.pb": "pprof.pb", From 085e4118ddaed26a06e8e77aa2ad375500faf26c Mon Sep 17 00:00:00 2001 From: Gregory Newman-Smith Date: Thu, 13 Nov 2025 17:12:09 +0000 Subject: [PATCH 13/13] rename files --- base/constants.go | 2 +- tools-tests/sgcollect_info_test.py | 2 +- tools/sgcollect.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/base/constants.go b/base/constants.go index f4dda65d83..22af3b5750 100644 --- a/base/constants.go +++ b/base/constants.go @@ -157,7 +157,7 @@ const ( FromConnStrWarningThreshold = 10 * time.Second // StackFilePrefix is the prefix used when writing stack trace files - StackFilePrefix = "sg_goroutines_" + StackFilePrefix = "sg_stack_trace_" ) // SyncGatewayRawDocXattrs is a list of xattrs that Sync Gateway will fetch when reading a raw document. diff --git a/tools-tests/sgcollect_info_test.py b/tools-tests/sgcollect_info_test.py index 11de366f7d..6a5b2503e0 100644 --- a/tools-tests/sgcollect_info_test.py +++ b/tools-tests/sgcollect_info_test.py @@ -81,7 +81,7 @@ def test_make_collect_logs_stacktrace(tmpdir): ).encode("utf-8") ), ): - stacktrace_file = tmpdir.join("sg_goroutines.log") + stacktrace_file = tmpdir.join("sg_stack_trace.log") stacktrace_file.write("foo") tasks = sgcollect.make_collect_logs_tasks( sg_url="fakeurl", diff --git a/tools/sgcollect.py b/tools/sgcollect.py index 66d92fdd19..d079530917 100755 --- a/tools/sgcollect.py +++ b/tools/sgcollect.py @@ -291,7 +291,7 @@ def make_http_client_stack_trace_task( name="Collect stack trace via http client", auth_headers=auth_headers, url=stack_trace_url, - log_file="sg_goroutines.log", + log_file="sg_stack_trace.log", ) stack_trace_task.no_header = True @@ -386,7 +386,7 @@ def make_collect_logs_tasks( "sg_debug.log": "sg_debug.log", "sg_trace.log": "sg_trace.log", "sg_stats.log": "sg_stats.log", - "sg_goroutines.log": "sg_goroutines.log", + "sg_stack_trace.log": "sg_stack_trace.log", "sync_gateway_access.log": "sync_gateway_access.log", "sync_gateway_error.log": "sync_gateway_error.log", "pprof.pb": "pprof.pb",