Skip to content
39 changes: 39 additions & 0 deletions base/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ import (
"net"
"net/http"
"net/url"
"os"
"path/filepath"
"reflect"
"regexp"
"runtime"
Expand Down Expand Up @@ -1831,3 +1833,40 @@ func IsRevTreeID(s string) bool {
}
return false
}

// GetStackTrace returns the stack traces of all goroutines in the current process as a string.
//
// runtime.Stack truncates its output when the supplied buffer is too small, so a result that
// completely fills the buffer is treated as "possibly truncated" and the capture is retried with
// a buffer of twice the size. The buffer starts at 1MB and at most five attempts are made; if the
// final attempt still fills its buffer, the (possibly truncated) trace from that attempt is
// returned rather than discarding it.
func GetStackTrace() string {
	const maxAttempts = 5
	buf := make([]byte, 1<<20)
	for attempt := 1; ; attempt++ {
		n := runtime.Stack(buf, true)
		// n < len(buf) means the whole trace fit. On the last attempt return whatever was
		// captured instead of a freshly allocated, empty buffer.
		if n < len(buf) || attempt == maxAttempts {
			return string(buf[:n])
		}
		buf = make([]byte, 2*len(buf))
	}
}

// RotateProfilesIfNeeded prunes files matching the glob pattern filename so that at most 10 remain.
//
// The matches returned by filepath.Glob are reversed and the first 10 entries of the reversed
// list are kept; every remaining match is deleted. (Presumably callers embed a sortable timestamp
// in the file name so that the retained entries are the most recent ones - confirm against callers.)
//
// An error is returned if the pattern cannot be globbed. Failures to remove individual files do
// not stop the rotation; they are accumulated and returned together.
func RotateProfilesIfNeeded(filename string) error {
	const maxRetained = 10

	matches, globErr := filepath.Glob(filename)
	if globErr != nil {
		return fmt.Errorf("Error listing existing profiles in %q: %w", filename, globErr)
	}

	if len(matches) > maxRetained {
		// reverse so that the entries to keep sit at the front of the slice
		slices.Reverse(matches)

		var removalErrs *MultiError
		for _, stale := range matches[maxRetained:] {
			if rmErr := os.Remove(stale); rmErr != nil {
				removalErrs = removalErrs.Append(fmt.Errorf("Error removing old profile %q: %w", stale, rmErr))
			}
		}
		return removalErrs.ErrorOrNil()
	}
	return nil
}
2 changes: 2 additions & 0 deletions docs/api/admin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ paths:
$ref: ./paths/admin/_debug-pprof-trace.yaml
/_debug/fgprof:
$ref: ./paths/admin/_debug-fgprof.yaml
/_debug/stacktrace:
$ref: ./paths/admin/_debug-stack_trace.yaml
/_post_upgrade:
$ref: ./paths/admin/_post_upgrade.yaml
'/{db}/_config':
Expand Down
25 changes: 25 additions & 0 deletions docs/api/paths/admin/_debug-stack_trace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright 2025-Present Couchbase, Inc.
#
# Use of this software is governed by the Business Source License included
# in the file licenses/BSL-Couchbase.txt. As of the Change Date specified
# in that file, in accordance with the Business Source License, use of this
# software will be governed by the Apache License, Version 2.0, included in
# the file licenses/APL2.txt.
get:
summary: Get stack trace for all goroutines
description: |-
Returns stack traces of all running goroutines in Sync Gateway.

Required Sync Gateway RBAC roles:

* Sync Gateway Dev Ops
responses:
'200':
description: OK
content:
text/plain:
schema:
type: string
tags:
- Profiling
operationId: get__debug-stacktrace
11 changes: 11 additions & 0 deletions rest/adminapitest/admin_api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2269,6 +2269,17 @@ func TestHandleSGCollect(t *testing.T) {
rest.RequireStatus(t, resp, http.StatusBadRequest)
}

// TestHandleGetStackTrace issues a GET to the admin /_debug/stacktrace endpoint and checks that
// the response succeeds and that its body looks like a goroutine dump that includes the handler
// serving the request.
func TestHandleGetStackTrace(t *testing.T) {
	rt := rest.NewRestTester(t, nil)
	defer rt.Close()

	response := rt.SendAdminRequest(http.MethodGet, "/_debug/stacktrace", "")
	rest.RequireStatus(t, response, http.StatusOK)

	body := response.Body.String()
	// the dump should mention goroutines in general and the handler function in particular
	for _, expected := range []string{"goroutine", "handleCollectStackTrace"} {
		assert.Contains(t, body, expected)
	}
}

func TestConfigRedaction(t *testing.T) {
base.LongRunningTest(t)

Expand Down
8 changes: 8 additions & 0 deletions rest/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,14 @@ func (h *handler) handleFgprof() error {
return stopFn()
}

// handleCollectStackTrace captures the stack traces of all goroutines via base.GetStackTrace and
// writes them to the response with writeText. It always returns nil.
func (h *handler) handleCollectStackTrace() error {
	h.writeText([]byte(base.GetStackTrace()))
	return nil
}

func (h *handler) handlePprofBlock() error {
sec, err := strconv.ParseInt(h.rq.FormValue("seconds"), 10, 64)
if sec <= 0 || err != nil {
Expand Down
2 changes: 2 additions & 0 deletions rest/routing.go
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,8 @@ func CreateAdminRouter(sc *ServerContext) *mux.Router {
makeHandlerWithOptions(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handlePprofTrace, handlerOptions{sgcollect: true})).Methods("GET", "POST")
r.Handle("/_debug/fgprof",
makeHandlerWithOptions(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handleFgprof, handlerOptions{sgcollect: true})).Methods("GET", "POST")
r.Handle("/_debug/stacktrace",
makeHandlerWithOptions(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handleCollectStackTrace, handlerOptions{sgcollect: true})).Methods("GET")

r.Handle("/_post_upgrade",
makeHandler(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handlePostUpgrade)).Methods("POST")
Expand Down
46 changes: 46 additions & 0 deletions rest/server_context.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import (
"net"
"net/http"
"os"
"path/filepath"
"runtime"
"slices"
"sort"
"strconv"
Expand Down Expand Up @@ -59,6 +61,8 @@ var ErrSuspendingDisallowed = errors.New("database does not allow suspending")

var allServers = []serverType{publicServer, adminServer, metricsServer, diagnosticServer}

// stackFilePrefix is the file name prefix of the stack trace files that logStackTraces writes
// into the configured log file path; the full name is stackFilePrefix + timestamp + ".log".
const stackFilePrefix = "sg_stack_trace_"

// serverInfo represents an instance of an HTTP server from sync gateway
type serverInfo struct {
server *http.Server // server is the HTTP server instance
Expand Down Expand Up @@ -93,6 +97,7 @@ type ServerContext struct {
DatabaseInitManager *DatabaseInitManager // Manages database initialization (index creation and readiness) independent of database stop/start/reload, when using persistent config
ActiveReplicationsCounter
invalidDatabaseConfigTracking invalidDatabaseConfigs
signalContextFunc context.CancelFunc
// handle sgcollect processes for a given Server
SGCollect *sgCollect
}
Expand Down Expand Up @@ -204,6 +209,12 @@ func NewServerContext(ctx context.Context, config *StartupConfig, persistentConf

sc.startStatsLogger(ctx)

if runtime.GOOS != "windows" {
signalCtx, cancelFunc := context.WithCancel(ctx)
sc.signalContextFunc = cancelFunc
sc.registerSignalHandlerForStackTrace(signalCtx)
}

return sc
}

Expand Down Expand Up @@ -275,6 +286,9 @@ func (sc *ServerContext) Close(ctx context.Context) {
base.InfofCtx(ctx, base.KeyAll, "Couldn't stop background config update worker: %v", err)
}

// cancel any signal handlers
sc.signalContextFunc()

sc.lock.Lock()
defer sc.lock.Unlock()

Expand Down Expand Up @@ -1844,6 +1858,38 @@ func (sc *ServerContext) logStats(ctx context.Context) error {

}

// logStackTraces captures the stack traces of all goroutines, prints them to stderr and writes
// them to a file named stackFilePrefix+timestamp+".log" inside the configured log file path.
// After a file has been created, old stack trace files in that directory are rotated with
// base.RotateProfilesIfNeeded.
//
// All failures are logged as warnings; nothing is returned to the caller. If the file cannot be
// created the function returns early, since there is nothing to write, close or rotate.
func (sc *ServerContext) logStackTraces(ctx context.Context, timestamp string) {
	base.InfofCtx(ctx, base.KeyAll, "Collecting stack trace for all goroutines")
	stackTrace := base.GetStackTrace()

	// log to console
	_, _ = fmt.Fprintf(os.Stderr, "Stack trace:\n%s\n", stackTrace)

	filename := filepath.Join(sc.Config.Logging.LogFilePath, stackFilePrefix+timestamp+".log")
	file, err := os.Create(filename)
	if err != nil {
		// file is nil here, so writing to or closing it would only produce further errors
		base.WarnfCtx(ctx, "Error opening stack trace file %s: %v", filename, err)
		return
	}

	if _, err = fmt.Fprintf(file, "Stack trace:\n%s\n", stackTrace); err != nil {
		base.WarnfCtx(ctx, "Error writing stack trace to file %s: %v", filename, err)
	}
	// close before rotating so the handle is released even if rotation fails
	if err = file.Close(); err != nil {
		base.WarnfCtx(ctx, "Error closing stack trace file %s: %v", filename, err)
	}

	rotatePath := filepath.Join(sc.Config.Logging.LogFilePath, stackFilePrefix+"*.log")
	if err = base.RotateProfilesIfNeeded(rotatePath); err != nil {
		base.WarnfCtx(ctx, "Error rotating stack trace files in path %s: %v", rotatePath, err)
	}
}

func (sc *ServerContext) logNetworkInterfaceStats(ctx context.Context) {

if err := sc.statsContext.addPublicNetworkInterfaceStatsForHostnamePort(sc.Config.API.PublicInterface); err != nil {
Expand Down
28 changes: 28 additions & 0 deletions rest/server_context_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"net/http"
"os"
"path/filepath"
"slices"
"strconv"
"strings"
"sync"
Expand Down Expand Up @@ -1062,3 +1063,30 @@ func TestDatabaseCollectionDeletedErrorState(t *testing.T) {
allDbs = rt.ServerContext().allDatabaseSummaries()
require.Len(t, allDbs, 1)
}

// TestCollectStackTraceFile checks that logStackTraces writes one file per call into the log
// directory and that, once more than 10 files exist, rotation brings the count back to 10.
func TestCollectStackTraceFile(t *testing.T) {
	base.SetUpTestLogging(t, base.LevelInfo, base.KeyAll)

	logDir := t.TempDir()
	startupConfig := DefaultStartupConfig(logDir)
	startupConfig.Logging.LogFilePath = logDir
	ctx := t.Context()
	sc := NewServerContext(ctx, &startupConfig, false)
	defer sc.Close(ctx)

	// a single capture produces exactly one file, named after the supplied timestamp
	firstStamp := "01"
	sc.logStackTraces(ctx, firstStamp)
	require.Len(t, getFilenames(t, logDir), 1)
	assert.True(t, slices.Contains(getFilenames(t, logDir), stackFilePrefix+firstStamp+".log"))

	// ten further captures (timestamps "4" through "13") push the total to eleven, which
	// triggers rotation; only the ten files written here are expected to remain
	const extraCaptures = 10
	expectedFiles := make([]string, 0, extraCaptures)
	for n := 4; n < 4+extraCaptures; n++ {
		stamp := fmt.Sprintf("%d", n)
		sc.logStackTraces(ctx, stamp)
		expectedFiles = append(expectedFiles, stackFilePrefix+stamp+".log")
	}

	remaining := getFilenames(t, logDir)
	require.Len(t, remaining, 10)
	require.ElementsMatch(t, remaining, expectedFiles)
}
54 changes: 54 additions & 0 deletions rest/stack_trace_handler_uinx.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
//go:build !windows
// +build !windows

/*
Copyright 2025-Present Couchbase, Inc.
Use of this software is governed by the Business Source License included in
the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that
file, in accordance with the Business Source License, use of this software will
be governed by the Apache License, Version 2.0, included in the file
licenses/APL2.txt.
*/

package rest

import (
"context"
"os"
"os/signal"
"syscall"
"time"

"github.com/couchbase/sync_gateway/base"
)

// registerSignalHandlerForStackTrace will register a signal handler to capture stack traces
// - SIGUSR1 causes Sync Gateway to record a stack trace of all running goroutines.
func (sc *ServerContext) registerSignalHandlerForStackTrace(ctx context.Context) {
signalChannel := make(chan os.Signal, 1)
signal.Notify(signalChannel, syscall.SIGUSR1)

defer func() {
signal.Stop(signalChannel)
close(signalChannel)
}()

go func() {
select {
case sig := <-signalChannel:
base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig)
switch sig {
case syscall.SIGUSR1:
// stack trace signal received
currentTime := time.Now()
timestamp := currentTime.Format(time.RFC3339)
sc.logStackTraces(ctx, timestamp)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this might be good to log that we are logging the stack trace to stderr, and also log the stack stack trace with the traditional InfofCtx logging so that it gets picked up if someone isn't grabbing stderr output.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have logging to indicate that a stack trace is being collected but feel logging at stderr and Info level and writing to the file is over kill. I have left out logging to info level given we write this stuff in a file for sgcollect to collect up anyway.

default:
// unhandled signal here
}
case <-ctx.Done():
return
}
}()
}
20 changes: 20 additions & 0 deletions rest/stack_trace_handler_windows.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
//go:build windows
// +build windows

/*
Copyright 2025-Present Couchbase, Inc.

Use of this software is governed by the Business Source License included in
the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that
file, in accordance with the Business Source License, use of this software will
be governed by the Apache License, Version 2.0, included in the file
licenses/APL2.txt.
*/

package rest

import "context"

// registerSignalHandlerForStackTrace will register a signal handler to capture stack traces
// - SIGUSR1 causes Sync Gateway to record a stack trace of all running goroutines.
func (sc *ServerContext) registerSignalHandlerForStackTrace(ctx context.Context) {

Check failure on line 18 in rest/stack_trace_handler_windows.go

View workflow job for this annotation

GitHub Actions / test (windows)

undefined: context

Check failure on line 18 in rest/stack_trace_handler_windows.go

View workflow job for this annotation

GitHub Actions / test (windows)

undefined: context
// No-op on Windows
}
21 changes: 4 additions & 17 deletions rest/stats_context.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import (
"path/filepath"
"runtime"
"runtime/pprof"
"slices"
"sync/atomic"
"time"

Expand Down Expand Up @@ -407,20 +406,8 @@ func (statsContext *statsContext) collectMemoryProfile(ctx context.Context, outp
if err != nil {
return fmt.Errorf("Error writing memory profile to %q: %w", filename, err)
}
existingProfiles, err := filepath.Glob(filepath.Join(outputDir, pprofPrefix+"*.pb.gz"))
if err != nil {
return fmt.Errorf("Error listing existing memory profiles in %q: %w", outputDir, err)
}
if len(existingProfiles) <= 10 {
return nil
}
slices.Reverse(existingProfiles)
var multiErr *base.MultiError
for _, profile := range existingProfiles[10:] {
err = os.Remove(profile)
if err != nil {
multiErr = multiErr.Append(fmt.Errorf("Error removing old memory profile %q: %w", profile, err))
}
}
return multiErr.ErrorOrNil()

// rotate old profiles
path := filepath.Join(outputDir, pprofPrefix+"*.pb.gz")
return base.RotateProfilesIfNeeded(path)
}
21 changes: 21 additions & 0 deletions tools-tests/sgcollect_info_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,27 @@ def test_make_collect_logs_heap_profile(tmpdir):
assert tasks[0].description.startswith("Contents of")


def test_make_collect_logs_stacktrace(tmpdir):
    """A stack trace file in the log directory is collected as a plain "Contents of" task."""
    # Fake response from Sync Gateway that reports tmpdir as the log file path.
    config_body = '{{"logfilepath": "{logpath}"}}'.format(
        logpath=normalize_path_for_json(tmpdir),
    )
    fake_urlopen = unittest.mock.patch(
        "sgcollect.urlopen",
        return_value=io.BytesIO(config_body.encode("utf-8")),
    )
    with fake_urlopen:
        stacktrace_file = tmpdir.join("sg_stack_trace.log")
        stacktrace_file.write("foo")
        tasks = sgcollect.make_collect_logs_tasks(
            sg_url="fakeurl",
            sg_config_file_path="",
            auth_headers={},
        )
        first_task = tasks[0]
        assert [first_task.log_file] == [stacktrace_file.basename]
        # ensure that this is not redacted task
        assert first_task.description.startswith("Contents of")


@pytest.mark.parametrize("should_redact", [True, False])
def test_make_collect_logs_tasks_duplicate_files(should_redact, tmp_path):
tmpdir1 = tmp_path / "tmpdir1"
Expand Down
Loading
Loading