Skip to content
44 changes: 44 additions & 0 deletions base/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ import (
"net"
"net/http"
"net/url"
"os"
"path/filepath"
"reflect"
"regexp"
"runtime"
Expand Down Expand Up @@ -1831,3 +1833,45 @@ func IsRevTreeID(s string) bool {
}
return false
}

// GetStackTrace will return goroutine stack traces for all goroutines in Sync Gateway.
func GetStackTrace() string {
// make 1MB buffer but if this buffer isn't big enough, runtime.Stack will
// return nothing, thus have 5 retires doubling the capacity each time.
buf := make([]byte, 1<<20)
for range 5 {
n := runtime.Stack(buf, true)
if n < len(buf) {
buf = buf[:n]
break
}
buf = make([]byte, 2*len(buf))
}
return string(buf)
}

// RotateProfilesIfNeeded will remove old files if there are more than
// 10 matching the given filename pattern.
func RotateProfilesIfNeeded(filename string) error {
existingFiles, err := filepath.Glob(filename)
if err != nil {
return fmt.Errorf("Error listing existing profiles in %q: %w", filename, err)
}
if len(existingFiles) <= 10 {
return nil
}
slices.Reverse(existingFiles)
var multiErr *MultiError
for _, profile := range existingFiles[10:] {
err = os.Remove(profile)
if err != nil {
multiErr = multiErr.Append(fmt.Errorf("Error removing old profile %q: %w", profile, err))
}
}
return multiErr.ErrorOrNil()
}

// CreateFileInDirectory will create a file in directory specified by filename.
func CreateFileInDirectory(filename string) (*os.File, error) {
return os.Create(filename)
}
11 changes: 11 additions & 0 deletions rest/adminapitest/admin_api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2269,6 +2269,17 @@ func TestHandleSGCollect(t *testing.T) {
rest.RequireStatus(t, resp, http.StatusBadRequest)
}

func TestHandleGetStackTrace(t *testing.T) {
rt := rest.NewRestTester(t, nil)
defer rt.Close()

resp := rt.SendAdminRequest(http.MethodGet, "/_debug/stacktrace", "")
rest.RequireStatus(t, resp, http.StatusOK)
rawResponseStr := resp.Body.String()
assert.Contains(t, rawResponseStr, "goroutine")
assert.Contains(t, rawResponseStr, "handleCollectStackTrace")
}

func TestConfigRedaction(t *testing.T) {
base.LongRunningTest(t)

Expand Down
8 changes: 8 additions & 0 deletions rest/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,14 @@ func (h *handler) handleFgprof() error {
return stopFn()
}

func (h *handler) handleCollectStackTrace() error {

stackTrace := base.GetStackTrace()

h.writeText([]byte(stackTrace))
return nil
}

func (h *handler) handlePprofBlock() error {
sec, err := strconv.ParseInt(h.rq.FormValue("seconds"), 10, 64)
if sec <= 0 || err != nil {
Expand Down
2 changes: 2 additions & 0 deletions rest/routing.go
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,8 @@ func CreateAdminRouter(sc *ServerContext) *mux.Router {
makeHandlerWithOptions(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handlePprofTrace, handlerOptions{sgcollect: true})).Methods("GET", "POST")
r.Handle("/_debug/fgprof",
makeHandlerWithOptions(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handleFgprof, handlerOptions{sgcollect: true})).Methods("GET", "POST")
r.Handle("/_debug/stacktrace",
makeHandlerWithOptions(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handleCollectStackTrace, handlerOptions{sgcollect: true})).Methods("GET")

r.Handle("/_post_upgrade",
makeHandler(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handlePostUpgrade)).Methods("POST")
Expand Down
61 changes: 61 additions & 0 deletions rest/server_context.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,15 @@
"net"
"net/http"
"os"
"os/signal"
"path/filepath"
"slices"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"

"github.com/KimMachineGun/automemlimit/memlimit"
Expand Down Expand Up @@ -59,6 +62,8 @@

var allServers = []serverType{publicServer, adminServer, metricsServer, diagnosticServer}

const stackFilePrefix = "sg_stack_trace_"

// serverInfo represents an instance of an HTTP server from sync gateway
type serverInfo struct {
server *http.Server // server is the HTTP server instance
Expand Down Expand Up @@ -204,9 +209,33 @@

sc.startStatsLogger(ctx)

sc.registerSignalHandlerForStackTrace(ctx)

return sc
}

// registerSignalHandlerForStackTrace will register a signal handler to capture stack traces
// - SIGUSR1 causes Sync Gateway to record a stack trace of all running goroutines.
func (sc *ServerContext) registerSignalHandlerForStackTrace(ctx context.Context) {
signalChannel := make(chan os.Signal, 1)
signal.Notify(signalChannel, syscall.SIGUSR1)

Check failure on line 221 in rest/server_context.go

View workflow job for this annotation

GitHub Actions / test (windows)

undefined: syscall.SIGUSR1

Check failure on line 221 in rest/server_context.go

View workflow job for this annotation

GitHub Actions / test (windows)

undefined: syscall.SIGUSR1

go func() {
for sig := range signalChannel {
base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig)
switch sig {
case syscall.SIGUSR1:

Check failure on line 227 in rest/server_context.go

View workflow job for this annotation

GitHub Actions / test (windows)

undefined: syscall.SIGUSR1

Check failure on line 227 in rest/server_context.go

View workflow job for this annotation

GitHub Actions / test (windows)

undefined: syscall.SIGUSR1
// stack trace signal received
currentTime := time.Now()
timestamp := currentTime.Format(time.RFC3339)
sc.logStackTraces(ctx, timestamp)
default:
// unhandled signal here
}
}
}()
}

func (sc *ServerContext) WaitForRESTAPIs(ctx context.Context) error {
timeout := 30 * time.Second
interval := time.Millisecond * 100
Expand Down Expand Up @@ -1844,6 +1873,38 @@

}

func (sc *ServerContext) logStackTraces(ctx context.Context, timestamp string) {

base.InfofCtx(ctx, base.KeyAll, "Collecting stack trace for all goroutines")
stackTrace := base.GetStackTrace()

// log to console
_, _ = fmt.Fprintf(os.Stderr, "Stack trace:\n%s\n", stackTrace)

filename := filepath.Join(sc.Config.Logging.LogFilePath, stackFilePrefix+timestamp+".log")
file, err := base.CreateFileInDirectory(filename)
defer func() {
err = file.Close()
if err != nil {
base.WarnfCtx(ctx, "Error closing stack trace file %s: %v", filename, err)
}
}()
if err != nil {
base.DebugfCtx(ctx, base.KeyAll, "Error opening stack trace file %s: %v", filename, err)
}

_, err = file.WriteString(fmt.Sprintf("Stack trace:\n%s\n", stackTrace))
if err != nil {
base.DebugfCtx(ctx, base.KeyAll, "Error writing stack trace to file %s: %v", filename, err)
}

rotatePath := filepath.Join(sc.Config.Logging.LogFilePath, stackFilePrefix+"*.log")
err = base.RotateProfilesIfNeeded(rotatePath)
if err != nil {
base.DebugfCtx(ctx, base.KeyAll, "Error rotating stack trace files in path %s: %v", rotatePath, err)
}
}

func (sc *ServerContext) logNetworkInterfaceStats(ctx context.Context) {

if err := sc.statsContext.addPublicNetworkInterfaceStatsForHostnamePort(sc.Config.API.PublicInterface); err != nil {
Expand Down
28 changes: 28 additions & 0 deletions rest/server_context_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"net/http"
"os"
"path/filepath"
"slices"
"strconv"
"strings"
"sync"
Expand Down Expand Up @@ -1062,3 +1063,30 @@ func TestDatabaseCollectionDeletedErrorState(t *testing.T) {
allDbs = rt.ServerContext().allDatabaseSummaries()
require.Len(t, allDbs, 1)
}

func TestCollectStackTraceFile(t *testing.T) {
base.SetUpTestLogging(t, base.LevelInfo, base.KeyAll)

tempPath := t.TempDir()
serverConfig := DefaultStartupConfig(tempPath)
serverConfig.Logging.LogFilePath = tempPath
ctx := t.Context()
serverContext := NewServerContext(ctx, &serverConfig, false)
defer serverContext.Close(ctx)

timeStamp := "01"
serverContext.logStackTraces(ctx, timeStamp)
require.Len(t, getFilenames(t, tempPath), 1)
slices.Contains(getFilenames(t, tempPath), stackFilePrefix+timeStamp+".log")

// trigger rotation and assert we don't go above 10 files
expectedFiles := make([]string, 0, 10)
for i := 2; i < 12; i++ {
timeStamp = fmt.Sprintf("%d", i+2)
serverContext.logStackTraces(ctx, timeStamp)
expectedFiles = append(expectedFiles, stackFilePrefix+timeStamp+".log")
}
files := getFilenames(t, tempPath)
require.Len(t, files, 10)
require.ElementsMatch(t, files, expectedFiles)
}
23 changes: 5 additions & 18 deletions rest/stats_context.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import (
"path/filepath"
"runtime"
"runtime/pprof"
"slices"
"sync/atomic"
"time"

Expand Down Expand Up @@ -393,7 +392,7 @@ func (statsContext *statsContext) collectMemoryProfile(ctx context.Context, outp

memoryProfile := pprof.Lookup("heap")
filename := filepath.Join(outputDir, pprofPrefix+timestamp+".pb.gz")
file, err := os.Create(filename)
file, err := base.CreateFileInDirectory(filename)
defer func() {
err = file.Close()
if err != nil {
Expand All @@ -407,20 +406,8 @@ func (statsContext *statsContext) collectMemoryProfile(ctx context.Context, outp
if err != nil {
return fmt.Errorf("Error writing memory profile to %q: %w", filename, err)
}
existingProfiles, err := filepath.Glob(filepath.Join(outputDir, pprofPrefix+"*.pb.gz"))
if err != nil {
return fmt.Errorf("Error listing existing memory profiles in %q: %w", outputDir, err)
}
if len(existingProfiles) <= 10 {
return nil
}
slices.Reverse(existingProfiles)
var multiErr *base.MultiError
for _, profile := range existingProfiles[10:] {
err = os.Remove(profile)
if err != nil {
multiErr = multiErr.Append(fmt.Errorf("Error removing old memory profile %q: %w", profile, err))
}
}
return multiErr.ErrorOrNil()

// rotate old profiles
path := filepath.Join(outputDir, pprofPrefix+"*.pb.gz")
return base.RotateProfilesIfNeeded(path)
}
21 changes: 21 additions & 0 deletions tools-tests/sgcollect_info_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,27 @@ def test_make_collect_logs_heap_profile(tmpdir):
assert tasks[0].description.startswith("Contents of")


def test_make_collect_logs_stacktrace(tmpdir):
with unittest.mock.patch(
"sgcollect.urlopen",
return_value=io.BytesIO(
'{{"logfilepath": "{logpath}"}}'.format(
logpath=normalize_path_for_json(tmpdir),
).encode("utf-8")
),
):
stacktrace_file = tmpdir.join("sg_stack_trace.log")
stacktrace_file.write("foo")
tasks = sgcollect.make_collect_logs_tasks(
sg_url="fakeurl",
sg_config_file_path="",
auth_headers={},
)
assert [tasks[0].log_file] == [stacktrace_file.basename]
# ensure that this is not redacted task
assert tasks[0].description.startswith("Contents of")


@pytest.mark.parametrize("should_redact", [True, False])
def test_make_collect_logs_tasks_duplicate_files(should_redact, tmp_path):
tmpdir1 = tmp_path / "tmpdir1"
Expand Down
25 changes: 25 additions & 0 deletions tools/sgcollect.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,25 @@ def make_http_client_pprof_tasks(
return pprof_tasks


def make_http_client_stack_trace_task(
sg_url: str, auth_headers: dict[str, str]
) -> PythonTask:
"""
This task uses the python http client to collect the raw stack trace data
"""
stack_trace_url = "{0}/_debug/stacktrace".format(sg_url)

stack_trace_task = make_curl_task(
name="Collect stack trace via http client",
auth_headers=auth_headers,
url=stack_trace_url,
log_file="sg_stack_trace.log",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
log_file="sg_stack_trace.log",
log_file="goroutines.log",

I don't care what we call this but sg_stack_trace.log will mean that this output overwrites the output from the signal handler, so it needs to be a different name.

Basically this is just a plain text representation of pprof_goroutines.pb.gz so I think that would be a reasonable name.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will alter the name for all the files to make it goroutine specific but this doesn't overwrite the signal files, the signal files will have timestamp in the file name, sgcollect http call will not.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, that's fine then.

)
stack_trace_task.no_header = True

return stack_trace_task


def to_lower_case_keys_dict(original_dict):
result = {}
for k, v in list(original_dict.items()):
Expand Down Expand Up @@ -367,6 +386,7 @@ def make_collect_logs_tasks(
"sg_debug.log": "sg_debug.log",
"sg_trace.log": "sg_trace.log",
"sg_stats.log": "sg_stats.log",
"sg_stack_trace.log": "sg_stack_trace.log",
"sync_gateway_access.log": "sync_gateway_access.log",
"sync_gateway_error.log": "sync_gateway_error.log",
"pprof.pb": "pprof.pb",
Expand Down Expand Up @@ -761,6 +781,10 @@ def make_sg_tasks(

http_client_pprof_tasks = make_http_client_pprof_tasks(sg_url, auth_headers)

http_client_stack_trace_task = make_http_client_stack_trace_task(
sg_url, auth_headers
)

# Add a task to collect Sync Gateway config
config_tasks = make_config_tasks(
sg_config_path, sg_url, auth_headers, should_redact
Expand All @@ -781,6 +805,7 @@ def make_sg_tasks(
collect_logs_tasks,
py_expvar_task,
http_client_pprof_tasks,
http_client_stack_trace_task,
config_tasks,
status_tasks,
]
Expand Down
Loading