Skip to content

Commit f4e6253

Browse files
authored
CBG-4714: collect goroutine stack traces on signal (#7870)
1 parent 43bdaa8 commit f4e6253

File tree

11 files changed

+286
-45
lines changed

11 files changed

+286
-45
lines changed

base/constants.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,9 @@ const (
155155

156156
// FromConnStrWarningThreshold determines the amount of time it should take before we warn about parsing a connstr (mostly for DNS resolution)
157157
FromConnStrWarningThreshold = 10 * time.Second
158+
159+
// StackFilePrefix is the prefix used when writing stack trace files
160+
StackFilePrefix = "sg_stack_trace_"
158161
)
159162

160163
// SyncGatewayRawDocXattrs is a list of xattrs that Sync Gateway will fetch when reading a raw document.

base/util.go

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,13 @@ import (
2727
"net"
2828
"net/http"
2929
"net/url"
30+
"os"
31+
"path/filepath"
3032
"reflect"
3133
"regexp"
3234
"runtime"
3335
"runtime/debug"
36+
"runtime/pprof"
3437
"slices"
3538
"sort"
3639
"strconv"
@@ -1831,3 +1834,70 @@ func IsRevTreeID(s string) bool {
18311834
}
18321835
return false
18331836
}
1837+
1838+
// GetStackTrace will return goroutine stack traces for all goroutines in Sync Gateway.
1839+
func GetStackTrace() (bytes.Buffer, error) {
1840+
profBuf := bytes.Buffer{}
1841+
err := pprof.Lookup("goroutine").WriteTo(&profBuf, 2)
1842+
return profBuf, err
1843+
}
1844+
1845+
// RotateFilenamesIfNeeded will remove old files if there are more than
1846+
// 10 matching the given filename pattern.
1847+
func RotateFilenamesIfNeeded(filename string) error {
1848+
existingFiles, err := filepath.Glob(filename)
1849+
if err != nil {
1850+
return fmt.Errorf("Error listing existing profiles in %q: %w", filename, err)
1851+
}
1852+
if len(existingFiles) <= 10 {
1853+
return nil
1854+
}
1855+
slices.Reverse(existingFiles)
1856+
var multiErr *MultiError
1857+
for _, profile := range existingFiles[10:] {
1858+
err = os.Remove(profile)
1859+
if err != nil {
1860+
multiErr = multiErr.Append(fmt.Errorf("Error removing old profile %q: %w", profile, err))
1861+
}
1862+
}
1863+
return multiErr.ErrorOrNil()
1864+
}
1865+
1866+
func LogStackTraces(ctx context.Context, logDirectory string, stackTrace bytes.Buffer, timestamp string) {
1867+
1868+
// log to console
1869+
_, _ = fmt.Fprintf(os.Stderr, "Stack trace:\n%s\n", stackTrace.String())
1870+
1871+
err := writeStackTraceFile(ctx, logDirectory, timestamp, stackTrace)
1872+
if err != nil {
1873+
return
1874+
}
1875+
1876+
rotatePath := filepath.Join(logDirectory, StackFilePrefix+"*.log")
1877+
err = RotateFilenamesIfNeeded(rotatePath)
1878+
if err != nil {
1879+
WarnfCtx(ctx, "Error rotating stack trace files in path %s: %v", rotatePath, err)
1880+
}
1881+
}
1882+
1883+
func writeStackTraceFile(ctx context.Context, logDirectory, timestamp string, stackTrace bytes.Buffer) error {
1884+
filename := filepath.Join(logDirectory, StackFilePrefix+timestamp+".log")
1885+
file, err := os.Create(filename)
1886+
defer func() {
1887+
closeErr := file.Close()
1888+
if closeErr != nil {
1889+
WarnfCtx(ctx, "Error closing stack trace file %s: %v", filename, closeErr)
1890+
}
1891+
}()
1892+
if err != nil {
1893+
WarnfCtx(ctx, "Error opening stack trace file %s: %v", filename, err)
1894+
return err
1895+
}
1896+
1897+
_, err = file.WriteString(fmt.Sprintf("Stack trace:\n%s\n", stackTrace.String()))
1898+
if err != nil {
1899+
WarnfCtx(ctx, "Error writing stack trace to file %s: %v", filename, err)
1900+
return err
1901+
}
1902+
return nil
1903+
}

rest/adminapitest/admin_api_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2292,6 +2292,17 @@ func TestHandleSGCollect(t *testing.T) {
22922292
rest.RequireStatus(t, resp, http.StatusBadRequest)
22932293
}
22942294

2295+
func TestHandleGetStackTrace(t *testing.T) {
2296+
rt := rest.NewRestTester(t, nil)
2297+
defer rt.Close()
2298+
2299+
resp := rt.SendAdminRequest(http.MethodGet, "/_debug/pprof/goroutine?debug=2", "")
2300+
rest.RequireStatus(t, resp, http.StatusOK)
2301+
rawResponseStr := resp.Body.String()
2302+
assert.Contains(t, rawResponseStr, "goroutine")
2303+
assert.Contains(t, rawResponseStr, "handlePprofGoroutine")
2304+
}
2305+
22952306
func TestConfigRedaction(t *testing.T) {
22962307
base.LongRunningTest(t)
22972308

rest/config.go

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,10 @@ import (
2222
_ "net/http/pprof"
2323
"net/url"
2424
"os"
25-
"os/signal"
2625
"strconv"
2726
"strings"
2827
"sync"
2928
"sync/atomic"
30-
"syscall"
3129
"time"
3230

3331
"github.com/go-jose/go-jose/v4"
@@ -2318,29 +2316,6 @@ func HandleSighup(ctx context.Context) {
23182316
base.RotateLogfiles(ctx)
23192317
}
23202318

2321-
// RegisterSignalHandler invokes functions based on the given signals:
2322-
// - SIGHUP causes Sync Gateway to rotate log files.
2323-
// - SIGINT or SIGTERM causes Sync Gateway to exit cleanly.
2324-
// - SIGKILL cannot be handled by the application.
2325-
func RegisterSignalHandler(ctx context.Context) {
2326-
signalChannel := make(chan os.Signal, 1)
2327-
signal.Notify(signalChannel, syscall.SIGHUP, os.Interrupt, syscall.SIGTERM)
2328-
2329-
go func() {
2330-
for sig := range signalChannel {
2331-
base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig)
2332-
switch sig {
2333-
case syscall.SIGHUP:
2334-
HandleSighup(ctx)
2335-
default:
2336-
// Ensure log buffers are flushed before exiting.
2337-
base.FlushLogBuffers()
2338-
os.Exit(130) // 130 == exit code 128 + 2 (interrupt)
2339-
}
2340-
}
2341-
}()
2342-
}
2343-
23442319
// toDbLogConfig converts the stored logging in a DbConfig to a runtime DbLogConfig for evaluation at log time.
23452320
// This is required to turn the stored config (which does not have data stored in a O(1)-compatible format) into a data structure that has O(1) lookups for checking if we should log.
23462321
func (c *DbConfig) toDbLogConfig(ctx context.Context) *base.DbLogConfig {

rest/main.go

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@ import (
1616
"fmt"
1717
"io"
1818
"os"
19+
"os/signal"
1920
"path/filepath"
21+
"runtime"
2022
"strings"
2123
"time"
2224

@@ -37,7 +39,7 @@ func ServerMain() {
3739

3840
// TODO: Pass ctx down into HTTP servers so that serverMain can be stopped.
3941
func serverMain(ctx context.Context, osArgs []string) error {
40-
RegisterSignalHandler(ctx)
42+
sigChan := RegisterSignalHandler(ctx, "")
4143
defer base.FatalPanicHandler()
4244

4345
base.InitializeMemoryLoggers()
@@ -56,7 +58,7 @@ func serverMain(ctx context.Context, osArgs []string) error {
5658
return legacyServerMain(ctx, osArgs, flagStartupConfig)
5759
}
5860

59-
disablePersistentConfigFallback, err := serverMainPersistentConfig(ctx, fs, flagStartupConfig)
61+
disablePersistentConfigFallback, err := serverMainPersistentConfig(ctx, fs, flagStartupConfig, sigChan)
6062
if disablePersistentConfigFallback {
6163
return legacyServerMain(ctx, osArgs, flagStartupConfig)
6264
}
@@ -65,7 +67,7 @@ func serverMain(ctx context.Context, osArgs []string) error {
6567
}
6668

6769
// serverMainPersistentConfig runs the Sync Gateway server with persistent config.
68-
func serverMainPersistentConfig(ctx context.Context, fs *flag.FlagSet, flagStartupConfig *StartupConfig) (disablePersistentConfigFallback bool, err error) {
70+
func serverMainPersistentConfig(ctx context.Context, fs *flag.FlagSet, flagStartupConfig *StartupConfig, sigChan chan os.Signal) (disablePersistentConfigFallback bool, err error) {
6971

7072
sc := DefaultStartupConfig(defaultLogFilePath)
7173
base.TracefCtx(ctx, base.KeyAll, "default config: %#v", sc)
@@ -148,6 +150,14 @@ func serverMainPersistentConfig(ctx context.Context, fs *flag.FlagSet, flagStart
148150
return false, err
149151
}
150152

153+
if runtime.GOOS != "windows" {
154+
// stop signal handlers and register for stack trace handling to be able to log to configured directory,
155+
// this is not supported for windows environments
156+
signal.Stop(sigChan)
157+
close(sigChan)
158+
RegisterSignalHandler(ctx, svrctx.Config.Logging.LogFilePath)
159+
}
160+
151161
svrctx.initialStartupConfig = initialStartupConfig
152162

153163
svrctx.addLegacyPrincipals(ctx, legacyDbUsers, legacyDbRoles)

rest/register_handler_unix.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
//go:build !windows
2+
// +build !windows
3+
4+
/*
5+
Copyright 2025-Present Couchbase, Inc.
6+
7+
Use of this software is governed by the Business Source License included in
8+
the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that
9+
file, in accordance with the Business Source License, use of this software will
10+
be governed by the Apache License, Version 2.0, included in the file
11+
licenses/APL2.txt.
12+
*/
13+
14+
package rest
15+
16+
import (
17+
"context"
18+
"os"
19+
"os/signal"
20+
"syscall"
21+
"time"
22+
23+
"github.com/couchbase/sync_gateway/base"
24+
)
25+
26+
// RegisterSignalHandler invokes functions based on the given signals for unix environments:
27+
// - SIGHUP causes Sync Gateway to rotate log files.
28+
// - SIGINT or SIGTERM causes Sync Gateway to exit cleanly.
29+
// - SIGKILL cannot be handled by the application.
30+
// - SIGUSR1 causes Sync Gateway to log stack traces for all goroutines.
31+
func RegisterSignalHandler(ctx context.Context, logDirectory string) chan os.Signal {
32+
signalChannel := make(chan os.Signal, 1)
33+
signal.Notify(signalChannel, syscall.SIGHUP, os.Interrupt, syscall.SIGTERM, syscall.SIGUSR1)
34+
35+
go func() {
36+
for sig := range signalChannel {
37+
base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig)
38+
switch sig {
39+
case syscall.SIGHUP:
40+
HandleSighup(ctx)
41+
case syscall.SIGUSR1:
42+
stackTrace, err := base.GetStackTrace()
43+
if err != nil {
44+
base.WarnfCtx(ctx, "Error collecting stack trace: %v", err)
45+
} else {
46+
base.InfofCtx(ctx, base.KeyAll, "Collecting stack trace for all goroutines")
47+
}
48+
// log to console and log to file in the log directory
49+
currentTime := time.Now()
50+
timestamp := currentTime.Format(time.RFC3339)
51+
base.LogStackTraces(ctx, logDirectory, stackTrace, timestamp)
52+
default:
53+
// Ensure log buffers are flushed before exiting.
54+
base.FlushLogBuffers()
55+
os.Exit(130) // 130 == exit code 128 + 2 (interrupt)
56+
}
57+
}
58+
}()
59+
return signalChannel
60+
}

rest/register_handler_windows.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
//go:build windows
2+
// +build windows
3+
4+
/*
5+
Copyright 2025-Present Couchbase, Inc.
6+
7+
Use of this software is governed by the Business Source License included in
8+
the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that
9+
file, in accordance with the Business Source License, use of this software will
10+
be governed by the Apache License, Version 2.0, included in the file
11+
licenses/APL2.txt.
12+
*/
13+
14+
package rest
15+
16+
import (
17+
"context"
18+
"os"
19+
"os/signal"
20+
"syscall"
21+
22+
"github.com/couchbase/sync_gateway/base"
23+
)
24+
25+
// RegisterSignalHandler invokes functions based on the given signals for windows environments:
26+
// - SIGHUP causes Sync Gateway to rotate log files.
27+
// - SIGINT or SIGTERM causes Sync Gateway to exit cleanly.
28+
// - SIGKILL cannot be handled by the application.
29+
func RegisterSignalHandler(ctx context.Context, logDirectory string) chan os.Signal {
30+
signalChannel := make(chan os.Signal, 1)
31+
signal.Notify(signalChannel, syscall.SIGHUP, os.Interrupt, syscall.SIGTERM)
32+
33+
go func() {
34+
for sig := range signalChannel {
35+
base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig)
36+
switch sig {
37+
case syscall.SIGHUP:
38+
HandleSighup(ctx)
39+
default:
40+
// Ensure log buffers are flushed before exiting.
41+
base.FlushLogBuffers()
42+
os.Exit(130) // 130 == exit code 128 + 2 (interrupt)
43+
}
44+
}
45+
}()
46+
return signalChannel
47+
}

rest/server_context_test.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"net/http"
1717
"os"
1818
"path/filepath"
19+
"slices"
1920
"strconv"
2021
"strings"
2122
"sync"
@@ -1062,3 +1063,34 @@ func TestDatabaseCollectionDeletedErrorState(t *testing.T) {
10621063
allDbs = rt.ServerContext().allDatabaseSummaries()
10631064
require.Len(t, allDbs, 1)
10641065
}
1066+
1067+
func TestCollectStackTraceFile(t *testing.T) {
1068+
base.SetUpTestLogging(t, base.LevelInfo, base.KeyAll)
1069+
1070+
tempPath := t.TempDir()
1071+
serverConfig := DefaultStartupConfig(tempPath)
1072+
serverConfig.Logging.LogFilePath = tempPath
1073+
ctx := t.Context()
1074+
serverContext := NewServerContext(ctx, &serverConfig, false)
1075+
defer serverContext.Close(ctx)
1076+
1077+
timeStamp := "01"
1078+
stackTrace, err := base.GetStackTrace()
1079+
require.NoError(t, err)
1080+
base.LogStackTraces(ctx, serverConfig.Logging.LogFilePath, stackTrace, timeStamp)
1081+
require.Len(t, getFilenames(t, tempPath), 1)
1082+
assert.True(t, slices.Contains(getFilenames(t, tempPath), base.StackFilePrefix+timeStamp+".log"))
1083+
1084+
// trigger rotation and assert we don't go above 10 files
1085+
expectedFiles := make([]string, 0, 10)
1086+
for i := 2; i < 12; i++ {
1087+
timeStamp = fmt.Sprintf("%d", i+2)
1088+
trace, err := base.GetStackTrace()
1089+
require.NoError(t, err)
1090+
base.LogStackTraces(ctx, serverConfig.Logging.LogFilePath, trace, timeStamp)
1091+
expectedFiles = append(expectedFiles, base.StackFilePrefix+timeStamp+".log")
1092+
}
1093+
files := getFilenames(t, tempPath)
1094+
require.Len(t, files, 10)
1095+
require.ElementsMatch(t, files, expectedFiles)
1096+
}

rest/stats_context.go

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ import (
1818
"path/filepath"
1919
"runtime"
2020
"runtime/pprof"
21-
"slices"
2221
"sync/atomic"
2322
"time"
2423

@@ -407,20 +406,8 @@ func (statsContext *statsContext) collectMemoryProfile(ctx context.Context, outp
407406
if err != nil {
408407
return fmt.Errorf("Error writing memory profile to %q: %w", filename, err)
409408
}
410-
existingProfiles, err := filepath.Glob(filepath.Join(outputDir, pprofPrefix+"*.pb.gz"))
411-
if err != nil {
412-
return fmt.Errorf("Error listing existing memory profiles in %q: %w", outputDir, err)
413-
}
414-
if len(existingProfiles) <= 10 {
415-
return nil
416-
}
417-
slices.Reverse(existingProfiles)
418-
var multiErr *base.MultiError
419-
for _, profile := range existingProfiles[10:] {
420-
err = os.Remove(profile)
421-
if err != nil {
422-
multiErr = multiErr.Append(fmt.Errorf("Error removing old memory profile %q: %w", profile, err))
423-
}
424-
}
425-
return multiErr.ErrorOrNil()
409+
410+
// rotate old profiles
411+
path := filepath.Join(outputDir, pprofPrefix+"*.pb.gz")
412+
return base.RotateFilenamesIfNeeded(path)
426413
}

0 commit comments

Comments
 (0)