Skip to content

Commit 8977c22

Browse files
craig[bot]jeffswenson
andcommitted
Merge #152212
152212: cloud: add optional http request logging r=jeffswenson a=jeffswenson This adds optional http level logging to the cloud package. The logs can be enabled using "--vmodule=cloud_logging_transport=1". Here is an example log line: ``` GET ‹https://cockroach-fixtures-us-east-2.s3.us-east-2.amazonaws.com/roachtest/master/tpcc-5k/20250820-055653.397/2025/08/20-061143.99/data/1099694952495284225.sst?x-id=GetObject› (206 Partial Content) sent=0,recv=(143/2787880),request_latency=31.38703ms,read_latency=9.694µs: closed early ``` The immediate motivation of these logs is to troubleshoot roachtests. But recent experience has shown that client request logs may be essential when trying to troubleshoot unexpected cloud provider issues. Informs: #151982 Release note: none Co-authored-by: Jeff Swenson <[email protected]>
2 parents 921d818 + 25853eb commit 8977c22

File tree

8 files changed

+284
-8
lines changed

8 files changed

+284
-8
lines changed

pkg/cloud/BUILD.bazel

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ go_library(
44
name = "cloud",
55
srcs = [
66
"cloud_io.go",
7+
"cloud_logging_transport.go",
78
"external_storage.go",
89
"impl_registry.go",
910
"kms.go",
@@ -31,7 +32,9 @@ go_library(
3132
"//pkg/util/retry",
3233
"//pkg/util/sysutil",
3334
"//pkg/util/tracing",
35+
"@com_github_cockroachdb_crlib//crtime",
3436
"@com_github_cockroachdb_errors//:errors",
37+
"@com_github_cockroachdb_redact//:redact",
3538
"@com_github_prometheus_client_model//go",
3639
"@com_github_stretchr_testify//require",
3740
],
@@ -41,14 +44,17 @@ go_test(
4144
name = "cloud_test",
4245
srcs = [
4346
"cloud_io_test.go",
47+
"cloud_logging_transport_test.go",
4448
"options_test.go",
4549
"uris_test.go",
4650
],
4751
embed = [":cloud"],
4852
deps = [
4953
"//pkg/cloud/cloudpb",
5054
"//pkg/util/ioctx",
55+
"//pkg/util/log",
5156
"@com_github_cockroachdb_errors//:errors",
57+
"@com_github_cockroachdb_redact//:redact",
5258
"@com_github_stretchr_testify//require",
5359
],
5460
)

pkg/cloud/cloud_io.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ func MakeHTTPClient(
100100
if err != nil {
101101
return nil, err
102102
}
103-
return MakeHTTPClientForTransport(t)
103+
return MakeHTTPClientForTransport(maybeAddLogging(t))
104104
}
105105

106106
// MakeHTTPClientForTransport creates a new http.Client with the given
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
// Copyright 2025 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the CockroachDB Software License
4+
// included in the /LICENSE file.
5+
6+
package cloud
7+
8+
import (
9+
"context"
10+
"io"
11+
"net/http"
12+
"sync"
13+
"sync/atomic"
14+
"time"
15+
16+
"github.com/cockroachdb/cockroach/pkg/util/log"
17+
"github.com/cockroachdb/cockroach/pkg/util/tracing"
18+
"github.com/cockroachdb/crlib/crtime"
19+
"github.com/cockroachdb/redact"
20+
)
21+
22+
// maybeAddLogging wraps the provided http.RoundTripper with a logging
23+
// transport if verbose logging is enabled.
24+
func maybeAddLogging(inner http.RoundTripper) http.RoundTripper {
25+
if log.V(1) {
26+
return &loggingTransport{inner: inner}
27+
}
28+
return inner
29+
}
30+
31+
type loggingTransport struct {
32+
inner http.RoundTripper
33+
}
34+
35+
// RoundTrip implements http.RoundTripper.
36+
func (l *loggingTransport) RoundTrip(request *http.Request) (*http.Response, error) {
37+
38+
// NOTE: Body can be nil if the request has no body.
39+
requestWatcher := &requestBodyTracker{inner: request.Body}
40+
if request.Body != nil {
41+
// Make a shallow clone of the request because the request object itself is
42+
// supposed to be immutable.
43+
request = request.Clone(request.Context())
44+
request.Body = requestWatcher
45+
}
46+
47+
now := crtime.NowMono()
48+
resp, err := l.inner.RoundTrip(request)
49+
if err != nil {
50+
log.Dev.Warningf(request.Context(), "%s %s: %v", redact.SafeString(request.Method), request.URL.String(), err)
51+
return resp, err
52+
}
53+
54+
logCtx, span := tracing.ForkSpan(request.Context(), "cloud-logging-transport")
55+
56+
resp.Body = &responseBodyTracker{
57+
inner: resp.Body,
58+
ctx: logCtx,
59+
span: span,
60+
61+
status: redact.SafeString(resp.Status),
62+
method: redact.SafeString(request.Method),
63+
url: request.URL.String(),
64+
requestLatency: now.Elapsed(),
65+
requestBytes: requestWatcher.readBytes.Load(),
66+
responseBytes: resp.ContentLength,
67+
}
68+
return resp, nil
69+
}
70+
71+
var _ http.RoundTripper = &loggingTransport{}
72+
73+
// requestBodyTracker is a wrapper around io.ReadCloser that tracks the number of
74+
// bytes read from the request body.
75+
type requestBodyTracker struct {
76+
inner io.ReadCloser
77+
// readBytes is the number of bytes returned by the underlying Read calls.
78+
// NOTE(jeffswenson): in practice, I don't think this actually needs to be an
79+
// atomic value. By the time we observe the value, the request body should be
80+
// fully consumed. But syscalls in golang are not defined to be memory
81+
// barriers. So from the perspective of the go race detector there is no
82+
// synchronization between the goroutine reading the request body and the
83+
// goroutine sending the request.
84+
readBytes atomic.Int64
85+
}
86+
87+
func (s *requestBodyTracker) Read(p []byte) (int, error) {
88+
n, err := s.inner.Read(p)
89+
s.readBytes.Add(int64(n))
90+
return n, err
91+
}
92+
93+
func (s *requestBodyTracker) Close() error {
94+
return s.inner.Close()
95+
}
96+
97+
// responseBodyTracker is an io.ReadCloser that wraps the original response
98+
// body. It counts the number of bytes read from the response body and logs
99+
// stats about the request when the body is closed.
100+
type responseBodyTracker struct {
101+
inner io.ReadCloser
102+
ctx context.Context
103+
span *tracing.Span
104+
105+
// status is the HTTP status code of the response. (e.g. "200 OK", "404 Not Found").
106+
status redact.SafeString
107+
// method is the HTTP method of the request (e.g., GET, POST).
108+
method redact.SafeString
109+
// url is the URL of the request.
110+
url string
111+
// requestLatency is the amount of time spent waiting for the response header.
112+
requestLatency time.Duration
113+
// requestBytes is the number of bytes sent in the request body.
114+
requestBytes int64
115+
// number of bytes in the response body. We often do not read the entire
116+
// body.
117+
responseBytes int64
118+
119+
// readErr is the error returned by the last Read() call on the response body. This is
120+
// expected to be io.EOF when the body is fully read, but may also be `nil` if we are closing
121+
// an incomplete response body or some network error that occurred after receiving the response
122+
// header but before reading the whole body.
123+
readErr error
124+
// readBytes is the number of bytes read from the response body. This is tracked
125+
// because we often close a request before reading the entire body.
126+
readBytes int64
127+
// readTime is the amount of time spent waiting in Read() calls.
128+
readTime time.Duration
129+
130+
// closeOnce ensures Close() logic is only called once. Some callers of
131+
// Close() are sloppy and call it multiple times. The double log lines are
132+
// annoying, but the real issue is the *tracing.Span. The tracing
133+
// infrastructure reuses memory internally, so it is essential that
134+
// span.Finish is called exactly once.
135+
closeOnce sync.Once
136+
}
137+
138+
func (l *responseBodyTracker) Read(p []byte) (int, error) {
139+
start := crtime.NowMono()
140+
141+
n, err := l.inner.Read(p)
142+
143+
l.readBytes += int64(n)
144+
l.readTime += start.Elapsed()
145+
l.readErr = err
146+
return n, err
147+
}
148+
149+
func (l *responseBodyTracker) Close() error {
150+
l.closeOnce.Do(func() {
151+
if l.readErr == io.EOF || (l.readErr == nil && l.readBytes == l.responseBytes) {
152+
log.Dev.Infof(l.ctx, "%s %s (%s) sent=%d,recv=%d,request_latency=%s,read_latency=%s",
153+
l.method, l.url, l.status,
154+
l.requestBytes, l.readBytes, l.requestLatency, l.readTime)
155+
} else if l.readErr != nil {
156+
log.Dev.Warningf(l.ctx, "%s %s (%s) sent=%d,recv=(%d/%d),request_latency=%s,read_latency=%s: %v",
157+
l.method, l.url, l.status,
158+
l.requestBytes, l.readBytes, l.responseBytes, l.requestLatency, l.readTime, l.readErr)
159+
} else {
160+
log.Dev.Infof(l.ctx, "%s %s (%s) sent=%d,recv=(%d/%d),request_latency=%s,read_latency=%s: closed early",
161+
l.method, l.url, l.status,
162+
l.requestBytes, l.readBytes, l.responseBytes, l.requestLatency, l.readTime)
163+
}
164+
l.span.Finish()
165+
})
166+
return l.inner.Close()
167+
}
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
// Copyright 2025 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the CockroachDB Software License
4+
// included in the /LICENSE file.
5+
6+
package cloud
7+
8+
import (
9+
"context"
10+
"io"
11+
"net/http"
12+
"net/http/httptest"
13+
"strings"
14+
"testing"
15+
16+
"github.com/cockroachdb/cockroach/pkg/util/log"
17+
"github.com/cockroachdb/redact"
18+
"github.com/stretchr/testify/require"
19+
)
20+
21+
func TestMaybeAddLogging(t *testing.T) {
22+
// Test that logging is only added when log.V(1) is true
23+
originalVModule := log.GetVModule()
24+
defer func() { _ = log.SetVModule(originalVModule) }()
25+
26+
t.Run("logging-disabled", func(t *testing.T) {
27+
_ = log.SetVModule("")
28+
inner := &http.Transport{}
29+
result := maybeAddLogging(inner)
30+
require.Equal(t, inner, result, "should return inner transport when logging is disabled")
31+
})
32+
33+
t.Run("logging-enabled", func(t *testing.T) {
34+
_ = log.SetVModule("*=1")
35+
inner := &http.Transport{}
36+
result := maybeAddLogging(inner)
37+
require.NotEqual(t, inner, result, "should return wrapped transport when logging is enabled")
38+
39+
loggingTransport, ok := result.(*loggingTransport)
40+
require.True(t, ok, "should return loggingTransport type")
41+
require.Equal(t, inner, loggingTransport.inner)
42+
})
43+
}
44+
45+
func TestCloudLoggingTransport(t *testing.T) {
46+
// Set up logging scope for testing
47+
sc := log.ScopeWithoutShowLogs(t)
48+
defer sc.Close(t)
49+
50+
ctx := context.Background()
51+
52+
// Create a test server that echoes back the request body
53+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
54+
// Read the request body to test request body tracking
55+
reqBody, err := io.ReadAll(r.Body)
56+
if err != nil {
57+
t.Errorf("failed to read request body: %v", err)
58+
http.Error(w, "failed to read body", http.StatusBadRequest)
59+
return
60+
}
61+
r.Body.Close()
62+
63+
// Echo back the request body
64+
w.Header().Set("Content-Type", "text/plain")
65+
w.WriteHeader(http.StatusOK)
66+
_, err = w.Write(reqBody)
67+
require.NoError(t, err)
68+
}))
69+
defer server.Close()
70+
71+
// Create a client with the logging transport
72+
client := &http.Client{
73+
Transport: &loggingTransport{inner: http.DefaultTransport},
74+
}
75+
76+
// Test POST request with body
77+
reqBody := "test request body content"
78+
req, err := http.NewRequestWithContext(ctx, "POST", server.URL+"/echo", strings.NewReader(reqBody))
79+
require.NoError(t, err)
80+
81+
resp, err := client.Do(req)
82+
require.NoError(t, err)
83+
require.NotNil(t, resp)
84+
require.Equal(t, http.StatusOK, resp.StatusCode)
85+
86+
// Read the response body to trigger logging
87+
respBody, err := io.ReadAll(resp.Body)
88+
require.NoError(t, err)
89+
require.Equal(t, reqBody, string(respBody))
90+
resp.Body.Close()
91+
92+
closer := resp.Body.(*responseBodyTracker)
93+
94+
require.NotZero(t, closer.requestLatency)
95+
require.Equal(t, closer.status, redact.SafeString("200 OK"))
96+
require.Equal(t, closer.method, redact.SafeString("POST"))
97+
require.Equal(t, closer.url, server.URL+"/echo")
98+
require.Equal(t, closer.requestBytes, int64(len(reqBody)))
99+
require.Equal(t, closer.responseBytes, int64(len(respBody)))
100+
require.Equal(t, closer.readBytes, int64(len(respBody)))
101+
require.ErrorIs(t, closer.readErr, io.EOF)
102+
require.NotZero(t, closer.readTime)
103+
}

pkg/cloud/httpsink/http_storage_test.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -453,7 +453,6 @@ func newTestingRangeHandler(
453453
w.WriteHeader(http.StatusInternalServerError)
454454
}
455455
}
456-
457456
}
458457

459458
// TestReadFileAtReturnsSize tests that ReadFileAt returns

pkg/cmd/roachtest/tests/backup_fixtures.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,8 +180,12 @@ type backupDriver struct {
180180
}
181181

182182
func (bd *backupDriver) prepareCluster(ctx context.Context) {
183+
startOptions := option.NewStartOpts(option.NoBackupSchedule)
184+
startOptions.RoachprodOpts.ExtraArgs = append(
185+
startOptions.RoachprodOpts.ExtraArgs,
186+
"--vmodule=cloud_logging_transport=1")
183187
bd.c.Start(
184-
ctx, bd.t.L(), option.NewStartOpts(option.NoBackupSchedule),
188+
ctx, bd.t.L(), startOptions,
185189
install.MakeClusterSettings(
186190
install.ClusterSettingsOption{
187191
// Large imports can run into a death spiral where splits fail because

pkg/cmd/roachtest/tests/backup_restore_roundtrip.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ func backupRestoreRoundTrip(
141141
})
142142

143143
startOpts := roachtestutil.MaybeUseMemoryBudget(t, 50)
144-
startOpts.RoachprodOpts.ExtraArgs = []string{"--vmodule=split_queue=3"}
144+
startOpts.RoachprodOpts.ExtraArgs = []string{"--vmodule=split_queue=3,cloud_logging_transport=1"}
145145
c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(envOption), c.CRDBNodes())
146146
m := c.NewDeprecatedMonitor(ctx, c.CRDBNodes())
147147

pkg/cmd/roachtest/tests/restore.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -700,9 +700,6 @@ type restoreSpecs struct {
700700

701701
setUpStmts []string
702702

703-
// extraArgs are passed to the cockroach binary at startup.
704-
extraArgs []string
705-
706703
// skip, if non-empty, skips the test with the given reason.
707704
skip string
708705

@@ -790,7 +787,7 @@ func (rd *restoreDriver) defaultClusterSettings() []install.ClusterSettingOption
790787

791788
func (rd *restoreDriver) roachprodOpts() option.StartOpts {
792789
opts := option.NewStartOpts(option.NoBackupSchedule)
793-
opts.RoachprodOpts.ExtraArgs = append(opts.RoachprodOpts.ExtraArgs, rd.sp.extraArgs...)
790+
opts.RoachprodOpts.ExtraArgs = append(opts.RoachprodOpts.ExtraArgs, "--vmodule=cloud_logging_transport=1")
794791
if rd.c.Spec().SSDs > 1 && !rd.c.Spec().RAID0 {
795792
opts.RoachprodOpts.StoreCount = rd.c.Spec().SSDs
796793
}

0 commit comments

Comments
 (0)