Skip to content

Commit ce80821

Browse files
committed
CBG-4714: collect goroutine stack traces on signal
1 parent 2d69700 commit ce80821

File tree

9 files changed

+206
-18
lines changed

9 files changed

+206
-18
lines changed

base/util.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ import (
2727
"net"
2828
"net/http"
2929
"net/url"
30+
"os"
31+
"path/filepath"
3032
"reflect"
3133
"regexp"
3234
"runtime"
@@ -1831,3 +1833,44 @@ func IsRevTreeID(s string) bool {
18311833
}
18321834
return false
18331835
}
1836+
1837+
func GetStackTrace() string {
1838+
// this nees logging in to warmn if we don;t successfully grab the stack in 10 tries
1839+
// mnaybe use some switch retyr logic
1840+
buf := make([]byte, 1<<20)
1841+
count := 0
1842+
for count < 10 {
1843+
n := runtime.Stack(buf, true)
1844+
fmt.Println("n", n)
1845+
if n < len(buf) {
1846+
buf = buf[:n]
1847+
break
1848+
}
1849+
buf = make([]byte, 2*len(buf))
1850+
count++
1851+
}
1852+
return string(buf)
1853+
}
1854+
1855+
func RotateProfilesIfNeeded(filename string) error {
1856+
existingFiles, err := filepath.Glob(filename)
1857+
if err != nil {
1858+
return fmt.Errorf("Error listing existing profiles in %q: %w", filename, err)
1859+
}
1860+
if len(existingFiles) <= 10 {
1861+
return nil
1862+
}
1863+
slices.Reverse(existingFiles)
1864+
var multiErr *MultiError
1865+
for _, profile := range existingFiles[10:] {
1866+
err = os.Remove(profile)
1867+
if err != nil {
1868+
multiErr = multiErr.Append(fmt.Errorf("Error removing old profile %q: %w", profile, err))
1869+
}
1870+
}
1871+
return multiErr.ErrorOrNil()
1872+
}
1873+
1874+
func CreateFileInLoggingDirectory(filename string) (*os.File, error) {
1875+
return os.Create(filename)
1876+
}

rest/adminapitest/admin_api_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2269,6 +2269,17 @@ func TestHandleSGCollect(t *testing.T) {
22692269
rest.RequireStatus(t, resp, http.StatusBadRequest)
22702270
}
22712271

2272+
func TestHandleGetStackTrace(t *testing.T) {
2273+
rt := rest.NewRestTester(t, nil)
2274+
defer rt.Close()
2275+
2276+
resp := rt.SendAdminRequest(http.MethodGet, "/_debug/stacktrace", "")
2277+
rest.RequireStatus(t, resp, http.StatusOK)
2278+
rawResponseStr := resp.Body.String()
2279+
assert.Contains(t, rawResponseStr, "goroutine")
2280+
assert.Contains(t, rawResponseStr, "handleCollectStackTrace")
2281+
}
2282+
22722283
func TestConfigRedaction(t *testing.T) {
22732284
base.LongRunningTest(t)
22742285

rest/api.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,14 @@ func (h *handler) handleFgprof() error {
693693
return stopFn()
694694
}
695695

696+
func (h *handler) handleCollectStackTrace() error {
697+
698+
stackTrace := base.GetStackTrace()
699+
700+
h.writeText([]byte(stackTrace))
701+
return nil
702+
}
703+
696704
func (h *handler) handlePprofBlock() error {
697705
sec, err := strconv.ParseInt(h.rq.FormValue("seconds"), 10, 64)
698706
if sec <= 0 || err != nil {

rest/routing.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,8 @@ func CreateAdminRouter(sc *ServerContext) *mux.Router {
316316
makeHandlerWithOptions(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handlePprofTrace, handlerOptions{sgcollect: true})).Methods("GET", "POST")
317317
r.Handle("/_debug/fgprof",
318318
makeHandlerWithOptions(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handleFgprof, handlerOptions{sgcollect: true})).Methods("GET", "POST")
319+
r.Handle("/_debug/stacktrace",
320+
makeHandlerWithOptions(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handleCollectStackTrace, handlerOptions{sgcollect: true})).Methods("GET")
319321

320322
r.Handle("/_post_upgrade",
321323
makeHandler(sc, adminPrivs, []Permission{PermDevOps}, nil, (*handler).handlePostUpgrade)).Methods("POST")

rest/server_context.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,15 @@ import (
2020
"net"
2121
"net/http"
2222
"os"
23+
"os/signal"
24+
"path/filepath"
2325
"slices"
2426
"sort"
2527
"strconv"
2628
"strings"
2729
"sync"
2830
"sync/atomic"
31+
"syscall"
2932
"time"
3033

3134
"github.com/KimMachineGun/automemlimit/memlimit"
@@ -59,6 +62,8 @@ var ErrSuspendingDisallowed = errors.New("database does not allow suspending")
5962

6063
var allServers = []serverType{publicServer, adminServer, metricsServer, diagnosticServer}
6164

65+
const stackFilePrefix = "sg_stack_trace_"
66+
6267
// serverInfo represents an instance of an HTTP server from sync gateway
6368
type serverInfo struct {
6469
server *http.Server // server is the HTTP server instance
@@ -204,9 +209,33 @@ func NewServerContext(ctx context.Context, config *StartupConfig, persistentConf
204209

205210
sc.startStatsLogger(ctx)
206211

212+
sc.registerSignalHandlerForStackTrace(ctx)
213+
207214
return sc
208215
}
209216

217+
// registerSignalHandlerForStackTrace will register a signal handler to capture stack traces
218+
// - SIGUSR1 causes Sync Gateway to record a stack trace of all running goroutines.
219+
func (sc *ServerContext) registerSignalHandlerForStackTrace(ctx context.Context) {
220+
signalChannel := make(chan os.Signal, 1)
221+
signal.Notify(signalChannel, syscall.SIGUSR1)
222+
223+
go func() {
224+
for sig := range signalChannel {
225+
base.InfofCtx(ctx, base.KeyAll, "Handling signal: %v", sig)
226+
switch sig {
227+
case syscall.SIGUSR1:
228+
// stack trace signal received
229+
currentTime := time.Now()
230+
timestamp := currentTime.Format(time.RFC3339)
231+
sc.logStackTraces(ctx, timestamp)
232+
default:
233+
// unhandled signal here
234+
}
235+
}
236+
}()
237+
}
238+
210239
func (sc *ServerContext) WaitForRESTAPIs(ctx context.Context) error {
211240
timeout := 30 * time.Second
212241
interval := time.Millisecond * 100
@@ -1844,6 +1873,38 @@ func (sc *ServerContext) logStats(ctx context.Context) error {
18441873

18451874
}
18461875

1876+
func (sc *ServerContext) logStackTraces(ctx context.Context, timestamp string) {
1877+
1878+
base.InfofCtx(ctx, base.KeyAll, "Collecting stack trace for all goroutines")
1879+
stackTrace := base.GetStackTrace()
1880+
1881+
// log to console
1882+
_, _ = fmt.Fprintf(os.Stderr, "Stack trace:\n%s\n", stackTrace)
1883+
1884+
filename := filepath.Join(sc.Config.Logging.LogFilePath, stackFilePrefix+timestamp+".log")
1885+
file, err := base.CreateFileInLoggingDirectory(filename)
1886+
defer func() {
1887+
err = file.Close()
1888+
if err != nil {
1889+
base.WarnfCtx(ctx, "Error closing stack trace file %s: %v", filename, err)
1890+
}
1891+
}()
1892+
if err != nil {
1893+
base.DebugfCtx(ctx, base.KeyAll, "Error opening stack trace file %s: %v", filename, err)
1894+
}
1895+
1896+
_, err = file.WriteString(fmt.Sprintf("Stack trace:\n%s\n", stackTrace))
1897+
if err != nil {
1898+
base.DebugfCtx(ctx, base.KeyAll, "Error writing stack trace to file %s: %v", filename, err)
1899+
}
1900+
1901+
rotatePath := filepath.Join(sc.Config.Logging.LogFilePath, stackFilePrefix+"*.log")
1902+
err = base.RotateProfilesIfNeeded(rotatePath)
1903+
if err != nil {
1904+
base.DebugfCtx(ctx, base.KeyAll, "Error rotating stack trace files in path %s: %v", rotatePath, err)
1905+
}
1906+
}
1907+
18471908
func (sc *ServerContext) logNetworkInterfaceStats(ctx context.Context) {
18481909

18491910
if err := sc.statsContext.addPublicNetworkInterfaceStatsForHostnamePort(sc.Config.API.PublicInterface); err != nil {

rest/server_context_test.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"net/http"
1717
"os"
1818
"path/filepath"
19+
"slices"
1920
"strconv"
2021
"strings"
2122
"sync"
@@ -1062,3 +1063,30 @@ func TestDatabaseCollectionDeletedErrorState(t *testing.T) {
10621063
allDbs = rt.ServerContext().allDatabaseSummaries()
10631064
require.Len(t, allDbs, 1)
10641065
}
1066+
1067+
func TestCollectStackTraceFile(t *testing.T) {
1068+
base.SetUpTestLogging(t, base.LevelInfo, base.KeyAll)
1069+
1070+
tempPath := t.TempDir()
1071+
serverConfig := DefaultStartupConfig(tempPath)
1072+
serverConfig.Logging.LogFilePath = tempPath
1073+
ctx := t.Context()
1074+
serverContext := NewServerContext(ctx, &serverConfig, false)
1075+
defer serverContext.Close(ctx)
1076+
1077+
timeStamp := "01"
1078+
serverContext.logStackTraces(ctx, timeStamp)
1079+
require.Len(t, getFilenames(t, tempPath), 1)
1080+
slices.Contains(getFilenames(t, tempPath), stackFilePrefix+timeStamp+".log")
1081+
1082+
// trigger rotation and assert we don't go above 10 files
1083+
expectedFiles := make([]string, 0, 10)
1084+
for i := 2; i < 12; i++ {
1085+
timeStamp = fmt.Sprintf("%d", i+2)
1086+
serverContext.logStackTraces(ctx, timeStamp)
1087+
expectedFiles = append(expectedFiles, stackFilePrefix+timeStamp+".log")
1088+
}
1089+
files := getFilenames(t, tempPath)
1090+
require.Len(t, files, 10)
1091+
require.ElementsMatch(t, files, expectedFiles)
1092+
}

rest/stats_context.go

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ import (
1818
"path/filepath"
1919
"runtime"
2020
"runtime/pprof"
21-
"slices"
2221
"sync/atomic"
2322
"time"
2423

@@ -393,7 +392,7 @@ func (statsContext *statsContext) collectMemoryProfile(ctx context.Context, outp
393392

394393
memoryProfile := pprof.Lookup("heap")
395394
filename := filepath.Join(outputDir, pprofPrefix+timestamp+".pb.gz")
396-
file, err := os.Create(filename)
395+
file, err := base.CreateFileInLoggingDirectory(filename)
397396
defer func() {
398397
err = file.Close()
399398
if err != nil {
@@ -407,20 +406,8 @@ func (statsContext *statsContext) collectMemoryProfile(ctx context.Context, outp
407406
if err != nil {
408407
return fmt.Errorf("Error writing memory profile to %q: %w", filename, err)
409408
}
410-
existingProfiles, err := filepath.Glob(filepath.Join(outputDir, pprofPrefix+"*.pb.gz"))
411-
if err != nil {
412-
return fmt.Errorf("Error listing existing memory profiles in %q: %w", outputDir, err)
413-
}
414-
if len(existingProfiles) <= 10 {
415-
return nil
416-
}
417-
slices.Reverse(existingProfiles)
418-
var multiErr *base.MultiError
419-
for _, profile := range existingProfiles[10:] {
420-
err = os.Remove(profile)
421-
if err != nil {
422-
multiErr = multiErr.Append(fmt.Errorf("Error removing old memory profile %q: %w", profile, err))
423-
}
424-
}
425-
return multiErr.ErrorOrNil()
409+
410+
// rotate old profiles
411+
path := filepath.Join(outputDir, pprofPrefix+"*.pb.gz")
412+
return base.RotateProfilesIfNeeded(path)
426413
}

tools-tests/sgcollect_info_test.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,28 @@ def test_make_collect_logs_heap_profile(tmpdir):
7272
assert tasks[0].description.startswith("Contents of")
7373

7474

75+
def test_make_collect_logs_stacktrace(tmpdir):
76+
with unittest.mock.patch(
77+
"sgcollect.urlopen",
78+
return_value=io.BytesIO(
79+
'{{"logfilepath": "{logpath}"}}'.format(
80+
logpath=normalize_path_for_json(tmpdir),
81+
).encode("utf-8")
82+
),
83+
):
84+
stacktrace_file = tmpdir.join("sg_stack_trace.log")
85+
stacktrace_file.write("foo")
86+
tasks = sgcollect.make_collect_logs_tasks(
87+
sg_url="fakeurl",
88+
sg_config_file_path="",
89+
auth_headers={},
90+
)
91+
assert [tasks[0].log_file] == [stacktrace_file.basename]
92+
# ensure that this is not redacted task
93+
assert tasks[0].description.startswith("Contents of")
94+
95+
96+
7597
@pytest.mark.parametrize("should_redact", [True, False])
7698
def test_make_collect_logs_tasks_duplicate_files(should_redact, tmp_path):
7799
tmpdir1 = tmp_path / "tmpdir1"

tools/sgcollect.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,26 @@ def make_http_client_pprof_tasks(
278278

279279
return pprof_tasks
280280

281+
def make_http_client_stack_trace_task(
282+
sg_url: str, auth_headers: dict[str, str]
283+
) -> PythonTask:
284+
"""
285+
This task uses the python http client to collect the raw stack trace data
286+
"""
287+
stack_trace_url = "{0}/_debug/stacktrace".format(sg_url)
288+
289+
stack_trace_task = make_curl_task(
290+
name="Collect stack trace via http client",
291+
auth_headers=auth_headers,
292+
url=stack_trace_url,
293+
log_file="sg_stack_trace.log",
294+
)
295+
stack_trace_task.no_header = True
296+
297+
return stack_trace_task
298+
299+
300+
281301

282302
def to_lower_case_keys_dict(original_dict):
283303
result = {}
@@ -367,6 +387,7 @@ def make_collect_logs_tasks(
367387
"sg_debug.log": "sg_debug.log",
368388
"sg_trace.log": "sg_trace.log",
369389
"sg_stats.log": "sg_stats.log",
390+
#"sg_stack_trace.log": "sg_stack_trace.log",
370391
"sync_gateway_access.log": "sync_gateway_access.log",
371392
"sync_gateway_error.log": "sync_gateway_error.log",
372393
"pprof.pb": "pprof.pb",
@@ -761,6 +782,10 @@ def make_sg_tasks(
761782

762783
http_client_pprof_tasks = make_http_client_pprof_tasks(sg_url, auth_headers)
763784

785+
http_client_stack_trace_task = make_http_client_stack_trace_task(
786+
sg_url, auth_headers
787+
)
788+
764789
# Add a task to collect Sync Gateway config
765790
config_tasks = make_config_tasks(
766791
sg_config_path, sg_url, auth_headers, should_redact
@@ -781,6 +806,7 @@ def make_sg_tasks(
781806
collect_logs_tasks,
782807
py_expvar_task,
783808
http_client_pprof_tasks,
809+
http_client_stack_trace_task,
784810
config_tasks,
785811
status_tasks,
786812
]

0 commit comments

Comments
 (0)