diff --git a/base/constants.go b/base/constants.go index 324669b67d..22af3b5750 100644 --- a/base/constants.go +++ b/base/constants.go @@ -155,6 +155,9 @@ const ( // FromConnStrWarningThreshold determines the amount of time it should take before we warn about parsing a connstr (mostly for DNS resolution) FromConnStrWarningThreshold = 10 * time.Second + + // StackFilePrefix is the prefix used when writing stack trace files + StackFilePrefix = "sg_stack_trace_" ) // SyncGatewayRawDocXattrs is a list of xattrs that Sync Gateway will fetch when reading a raw document. diff --git a/base/util.go b/base/util.go index 1b6daa9553..e21855e150 100644 --- a/base/util.go +++ b/base/util.go @@ -27,10 +27,13 @@ import ( "net" "net/http" "net/url" + "os" + "path/filepath" "reflect" "regexp" "runtime" "runtime/debug" + "runtime/pprof" "slices" "sort" "strconv" @@ -1831,3 +1834,70 @@ func IsRevTreeID(s string) bool { } return false } + +// GetStackTrace will return goroutine stack traces for all goroutines in Sync Gateway. +func GetStackTrace() (bytes.Buffer, error) { + profBuf := bytes.Buffer{} + err := pprof.Lookup("goroutine").WriteTo(&profBuf, 2) + return profBuf, err +} + +// RotateFilenamesIfNeeded will remove old files if there are more than +// 10 matching the given filename pattern. +func RotateFilenamesIfNeeded(filename string) error { + existingFiles, err := filepath.Glob(filename) + if err != nil { + return fmt.Errorf("Error listing existing profiles in %q: %w", filename, err) + } + if len(existingFiles) <= 10 { + return nil + } + slices.Reverse(existingFiles) + var multiErr *MultiError + for _, profile := range existingFiles[10:] { + err = os.Remove(profile) + if err != nil { + multiErr = multiErr.Append(fmt.Errorf("Error removing old profile %q: %w", profile, err)) + } + } + return multiErr.ErrorOrNil() +} + +func LogStackTraces(ctx context.Context, logDirectory string, stackTrace bytes.Buffer, timestamp string) { + + // log to console + _, _ = fmt.Fprintf(os.Stderr, "Stack trace:\n%s\n", stackTrace.String()) + + err := writeStackTraceFile(ctx, logDirectory, timestamp, stackTrace) + if err != nil { + return + } + + rotatePath := filepath.Join(logDirectory, StackFilePrefix+"*.log") + err = RotateFilenamesIfNeeded(rotatePath) + if err != nil { + WarnfCtx(ctx, "Error rotating stack trace files in path %s: %v", rotatePath, err) + } +} + +func writeStackTraceFile(ctx context.Context, logDirectory, timestamp string, stackTrace bytes.Buffer) error { + filename := filepath.Join(logDirectory, StackFilePrefix+timestamp+".log") + file, err := os.Create(filename) + defer func() { + closeErr := file.Close() + if closeErr != nil { + WarnfCtx(ctx, "Error closing stack trace file %s: %v", filename, closeErr) + } + }() + if err != nil { + WarnfCtx(ctx, "Error opening stack trace file %s: %v", filename, err) + return err + } + + _, err = file.WriteString(fmt.Sprintf("Stack trace:\n%s\n", stackTrace.String())) + if err != nil { + WarnfCtx(ctx, "Error writing stack trace to file %s: %v", filename, err) + return err + } + return nil +} diff --git a/rest/adminapitest/admin_api_test.go b/rest/adminapitest/admin_api_test.go index 89045ba4cb..0078a826fc 100644 --- a/rest/adminapitest/admin_api_test.go +++ b/rest/adminapitest/admin_api_test.go @@ -2269,6 +2269,17 @@ func TestHandleSGCollect(t *testing.T) { rest.RequireStatus(t, resp, http.StatusBadRequest) } +func TestHandleGetStackTrace(t *testing.T) { + rt := rest.NewRestTester(t, nil) + defer rt.Close() + + resp := rt.SendAdminRequest(http.MethodGet, "/_debug/pprof/goroutine?debug=2", "") + rest.RequireStatus(t, resp, http.StatusOK) + rawResponseStr := resp.Body.String() + assert.Contains(t, rawResponseStr, "goroutine") + assert.Contains(t, rawResponseStr, "handlePprofGoroutine") +} + func TestConfigRedaction(t *testing.T) { base.LongRunningTest(t) diff --git a/rest/config.go b/rest/config.go index 92749fc22f..36682771b7 100644 --- a/rest/config.go +++ b/rest/config.go @@ -22,12 +22,10 @@ import ( _ "net/http/pprof" "net/url" "os" - "os/signal" "strconv" "strings" "sync" "sync/atomic" - "syscall" "time" "github.com/go-jose/go-jose/v4" @@ -2315,29 +2313,6 @@ func HandleSighup(ctx context.Context) { base.RotateLogfiles(ctx) } -// RegisterSignalHandler invokes functions based on the given signals: -// - SIGHUP causes Sync Gateway to rotate log files. -// - SIGINT or SIGTERM causes Sync Gateway to exit cleanly. -// - SIGKILL cannot be handled by the application. -func RegisterSignalHandler(ctx context.Context) { - signalChannel := make(chan os.Signal, 1) - signal.Notify(signalChannel, syscall.SIGHUP, os.Interrupt, syscall.SIGTERM) - - go func() { - for sig := range signalChannel { - base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig) - switch sig { - case syscall.SIGHUP: - HandleSighup(ctx) - default: - // Ensure log buffers are flushed before exiting. - base.FlushLogBuffers() - os.Exit(130) // 130 == exit code 128 + 2 (interrupt) - } - } - }() -} - // toDbLogConfig converts the stored logging in a DbConfig to a runtime DbLogConfig for evaluation at log time. // This is required to turn the stored config (which does not have data stored in a O(1)-compatible format) into a data structure that has O(1) lookups for checking if we should log. func (c *DbConfig) toDbLogConfig(ctx context.Context) *base.DbLogConfig { diff --git a/rest/main.go b/rest/main.go index 3fe7f3c812..d8e0fae77d 100644 --- a/rest/main.go +++ b/rest/main.go @@ -16,7 +16,9 @@ import ( "fmt" "io" "os" + "os/signal" "path/filepath" + "runtime" "strings" "time" @@ -37,7 +39,7 @@ func ServerMain() { // TODO: Pass ctx down into HTTP servers so that serverMain can be stopped. func serverMain(ctx context.Context, osArgs []string) error { - RegisterSignalHandler(ctx) + sigChan := RegisterSignalHandler(ctx, "") defer base.FatalPanicHandler() base.InitializeMemoryLoggers() @@ -56,7 +58,7 @@ func serverMain(ctx context.Context, osArgs []string) error { return legacyServerMain(ctx, osArgs, flagStartupConfig) } - disablePersistentConfigFallback, err := serverMainPersistentConfig(ctx, fs, flagStartupConfig) + disablePersistentConfigFallback, err := serverMainPersistentConfig(ctx, fs, flagStartupConfig, sigChan) if disablePersistentConfigFallback { return legacyServerMain(ctx, osArgs, flagStartupConfig) } @@ -65,7 +67,7 @@ func serverMain(ctx context.Context, osArgs []string) error { } // serverMainPersistentConfig runs the Sync Gateway server with persistent config. -func serverMainPersistentConfig(ctx context.Context, fs *flag.FlagSet, flagStartupConfig *StartupConfig) (disablePersistentConfigFallback bool, err error) { +func serverMainPersistentConfig(ctx context.Context, fs *flag.FlagSet, flagStartupConfig *StartupConfig, sigChan chan os.Signal) (disablePersistentConfigFallback bool, err error) { sc := DefaultStartupConfig(defaultLogFilePath) base.TracefCtx(ctx, base.KeyAll, "default config: %#v", sc) @@ -148,6 +150,14 @@ func serverMainPersistentConfig(ctx context.Context, fs *flag.FlagSet, flagStart return false, err } + if runtime.GOOS != "windows" { + // stop signal handlers and register for stack trace handling to be able to log to configured directory, + // this is not supported for windows environments + signal.Stop(sigChan) + close(sigChan) + RegisterSignalHandler(ctx, svrctx.Config.Logging.LogFilePath) + } + svrctx.initialStartupConfig = initialStartupConfig svrctx.addLegacyPrincipals(ctx, legacyDbUsers, legacyDbRoles) diff --git a/rest/register_handler_unix.go b/rest/register_handler_unix.go new file mode 100644 index 0000000000..d87fa1f99c --- /dev/null +++ b/rest/register_handler_unix.go @@ -0,0 +1,60 @@ +//go:build !windows +// +build !windows + +/* +Copyright 2025-Present Couchbase, Inc. + +Use of this software is governed by the Business Source License included in +the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that +file, in accordance with the Business Source License, use of this software will +be governed by the Apache License, Version 2.0, included in the file +licenses/APL2.txt. +*/ + +package rest + +import ( + "context" + "os" + "os/signal" + "syscall" + "time" + + "github.com/couchbase/sync_gateway/base" +) + +// RegisterSignalHandler invokes functions based on the given signals for unix environments: +// - SIGHUP causes Sync Gateway to rotate log files. +// - SIGINT or SIGTERM causes Sync Gateway to exit cleanly. +// - SIGKILL cannot be handled by the application. +// - SIGUSR1 causes Sync Gateway to log stack traces for all goroutines. +func RegisterSignalHandler(ctx context.Context, logDirectory string) chan os.Signal { + signalChannel := make(chan os.Signal, 1) + signal.Notify(signalChannel, syscall.SIGHUP, os.Interrupt, syscall.SIGTERM, syscall.SIGUSR1) + + go func() { + for sig := range signalChannel { + base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig) + switch sig { + case syscall.SIGHUP: + HandleSighup(ctx) + case syscall.SIGUSR1: + stackTrace, err := base.GetStackTrace() + if err != nil { + base.WarnfCtx(ctx, "Error collecting stack trace: %v", err) + } else { + base.InfofCtx(ctx, base.KeyAll, "Collecting stack trace for all goroutines") + } + // log to console and log to file in the log directory + currentTime := time.Now() + timestamp := currentTime.Format(time.RFC3339) + base.LogStackTraces(ctx, logDirectory, stackTrace, timestamp) + default: + // Ensure log buffers are flushed before exiting. + base.FlushLogBuffers() + os.Exit(130) // 130 == exit code 128 + 2 (interrupt) + } + } + }() + return signalChannel +} diff --git a/rest/register_handler_windows.go b/rest/register_handler_windows.go new file mode 100644 index 0000000000..9992a59ea0 --- /dev/null +++ b/rest/register_handler_windows.go @@ -0,0 +1,47 @@ +//go:build windows +// +build windows + +/* +Copyright 2025-Present Couchbase, Inc. + +Use of this software is governed by the Business Source License included in +the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that +file, in accordance with the Business Source License, use of this software will +be governed by the Apache License, Version 2.0, included in the file +licenses/APL2.txt. +*/ + +package rest + +import ( + "context" + "os" + "os/signal" + "syscall" + + "github.com/couchbase/sync_gateway/base" +) + +// RegisterSignalHandler invokes functions based on the given signals for windows environments: +// - SIGHUP causes Sync Gateway to rotate log files. +// - SIGINT or SIGTERM causes Sync Gateway to exit cleanly. +// - SIGKILL cannot be handled by the application. +func RegisterSignalHandler(ctx context.Context, logDirectory string) chan os.Signal { + signalChannel := make(chan os.Signal, 1) + signal.Notify(signalChannel, syscall.SIGHUP, os.Interrupt, syscall.SIGTERM) + + go func() { + for sig := range signalChannel { + base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig) + switch sig { + case syscall.SIGHUP: + HandleSighup(ctx) + default: + // Ensure log buffers are flushed before exiting. + base.FlushLogBuffers() + os.Exit(130) // 130 == exit code 128 + 2 (interrupt) + } + } + }() + return signalChannel +} diff --git a/rest/server_context_test.go b/rest/server_context_test.go index 9ad335e70a..5c78ac614d 100644 --- a/rest/server_context_test.go +++ b/rest/server_context_test.go @@ -16,6 +16,7 @@ import ( "net/http" "os" "path/filepath" + "slices" "strconv" "strings" "sync" @@ -1062,3 +1063,34 @@ func TestDatabaseCollectionDeletedErrorState(t *testing.T) { allDbs = rt.ServerContext().allDatabaseSummaries() require.Len(t, allDbs, 1) } + +func TestCollectStackTraceFile(t *testing.T) { + base.SetUpTestLogging(t, base.LevelInfo, base.KeyAll) + + tempPath := t.TempDir() + serverConfig := DefaultStartupConfig(tempPath) + serverConfig.Logging.LogFilePath = tempPath + ctx := t.Context() + serverContext := NewServerContext(ctx, &serverConfig, false) + defer serverContext.Close(ctx) + + timeStamp := "01" + stackTrace, err := base.GetStackTrace() + require.NoError(t, err) + base.LogStackTraces(ctx, serverConfig.Logging.LogFilePath, stackTrace, timeStamp) + require.Len(t, getFilenames(t, tempPath), 1) + assert.True(t, slices.Contains(getFilenames(t, tempPath), base.StackFilePrefix+timeStamp+".log")) + + // trigger rotation and assert we don't go above 10 files + expectedFiles := make([]string, 0, 10) + for i := 2; i < 12; i++ { + timeStamp = fmt.Sprintf("%d", i+2) + trace, err := base.GetStackTrace() + require.NoError(t, err) + base.LogStackTraces(ctx, serverConfig.Logging.LogFilePath, trace, timeStamp) + expectedFiles = append(expectedFiles, base.StackFilePrefix+timeStamp+".log") + } + files := getFilenames(t, tempPath) + require.Len(t, files, 10) + require.ElementsMatch(t, files, expectedFiles) +} diff --git a/rest/stats_context.go b/rest/stats_context.go index 7d48d8d0cf..6b8e5b65fd 100644 --- a/rest/stats_context.go +++ b/rest/stats_context.go @@ -18,7 +18,6 @@ import ( "path/filepath" "runtime" "runtime/pprof" - "slices" "sync/atomic" "time" @@ -407,20 +406,8 @@ func (statsContext *statsContext) collectMemoryProfile(ctx context.Context, outp if err != nil { return fmt.Errorf("Error writing memory profile to %q: %w", filename, err) } - existingProfiles, err := filepath.Glob(filepath.Join(outputDir, pprofPrefix+"*.pb.gz")) - if err != nil { - return fmt.Errorf("Error listing existing memory profiles in %q: %w", outputDir, err) - } - if len(existingProfiles) <= 10 { - return nil - } - slices.Reverse(existingProfiles) - var multiErr *base.MultiError - for _, profile := range existingProfiles[10:] { - err = os.Remove(profile) - if err != nil { - multiErr = multiErr.Append(fmt.Errorf("Error removing old memory profile %q: %w", profile, err)) - } - } - return multiErr.ErrorOrNil() + + // rotate old profiles + path := filepath.Join(outputDir, pprofPrefix+"*.pb.gz") + return base.RotateFilenamesIfNeeded(path) } diff --git a/tools-tests/sgcollect_info_test.py b/tools-tests/sgcollect_info_test.py index fb5195f9b9..6a5b2503e0 100644 --- a/tools-tests/sgcollect_info_test.py +++ b/tools-tests/sgcollect_info_test.py @@ -72,6 +72,27 @@ def test_make_collect_logs_heap_profile(tmpdir): assert tasks[0].description.startswith("Contents of") +def test_make_collect_logs_stacktrace(tmpdir): + with unittest.mock.patch( + "sgcollect.urlopen", + return_value=io.BytesIO( + '{{"logfilepath": "{logpath}"}}'.format( + logpath=normalize_path_for_json(tmpdir), + ).encode("utf-8") + ), + ): + stacktrace_file = tmpdir.join("sg_stack_trace.log") + stacktrace_file.write("foo") + tasks = sgcollect.make_collect_logs_tasks( + sg_url="fakeurl", + sg_config_file_path="", + auth_headers={}, + ) + assert [tasks[0].log_file] == [stacktrace_file.basename] + # ensure that this is not redacted task + assert tasks[0].description.startswith("Contents of") + + @pytest.mark.parametrize("should_redact", [True, False]) def test_make_collect_logs_tasks_duplicate_files(should_redact, tmp_path): tmpdir1 = tmp_path / "tmpdir1" diff --git a/tools/sgcollect.py b/tools/sgcollect.py index 63e7e30a25..d079530917 100755 --- a/tools/sgcollect.py +++ b/tools/sgcollect.py @@ -279,6 +279,25 @@ def make_http_client_pprof_tasks( return pprof_tasks +def make_http_client_stack_trace_task( + sg_url: str, auth_headers: dict[str, str] +) -> PythonTask: + """ + This task uses the python http client to collect the raw stack trace data + """ + stack_trace_url = "{0}/_debug/pprof/goroutine?debug=2".format(sg_url) + + stack_trace_task = make_curl_task( + name="Collect stack trace via http client", + auth_headers=auth_headers, + url=stack_trace_url, + log_file="sg_stack_trace.log", + ) + stack_trace_task.no_header = True + + return stack_trace_task + + def to_lower_case_keys_dict(original_dict): result = {} for k, v in list(original_dict.items()): @@ -367,6 +386,7 @@ def make_collect_logs_tasks( "sg_debug.log": "sg_debug.log", "sg_trace.log": "sg_trace.log", "sg_stats.log": "sg_stats.log", + "sg_stack_trace.log": "sg_stack_trace.log", "sync_gateway_access.log": "sync_gateway_access.log", "sync_gateway_error.log": "sync_gateway_error.log", "pprof.pb": "pprof.pb", @@ -761,6 +781,10 @@ def make_sg_tasks( http_client_pprof_tasks = make_http_client_pprof_tasks(sg_url, auth_headers) + http_client_stack_trace_task = make_http_client_stack_trace_task( + sg_url, auth_headers + ) + # Add a task to collect Sync Gateway config config_tasks = make_config_tasks( sg_config_path, sg_url, auth_headers, should_redact @@ -781,6 +805,7 @@ def make_sg_tasks( collect_logs_tasks, py_expvar_task, http_client_pprof_tasks, + http_client_stack_trace_task, config_tasks, status_tasks, ]