Skip to content

Commit 0e8321e

Browse files
craig[bot]dhartunian
andcommitted
Merge #151414
151414: tracing, sql: add Jaeger output option for transaction traces r=dhartunian a=dhartunian Adds a cluster setting `sql.trace.txn.jaeger_json_output.enabled` which is false by default, but when true will write Jaeger compatible JSON to the logs for transactions that meet the bar for verbose traces. Epic: None Release note: when outputting transaction traces to logs using probabilistic and/or statement latency-based triggers, users have the option to write the logs in Jaeger-compatible JSON format to the logs for easier analysis. Setting the cluster setting `sql.trace.txn.jaeger_json_output.enabled` to true, will change the output format. Co-authored-by: David Hartunian <[email protected]>
2 parents af632a6 + 71f045f commit 0e8321e

File tree

11 files changed

+85
-8
lines changed

11 files changed

+85
-8
lines changed

docs/generated/settings/settings-for-tenants.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,7 @@ sql.temp_object_cleaner.wait_interval duration 30m0s how long after creation a t
390390
sql.log.all_statements.enabled (alias: sql.trace.log_statement_execute) boolean false set to true to enable logging of all executed statements application
391391
sql.trace.stmt.enable_threshold duration 0s enables tracing on all statements; statements executing for longer than this duration will have their trace logged (set to 0 to disable); note that enabling this may have a negative performance impact; this setting applies to individual statements within a transaction and is therefore finer-grained than sql.trace.txn.enable_threshold application
392392
sql.trace.txn.enable_threshold duration 0s enables transaction traces for transactions exceeding this duration, used with `sql.trace.txn.sample_rate` application
393+
sql.trace.txn.jaeger_json_output.enabled boolean false enables Jaeger JSON output for transaction traces in logs application
393394
sql.trace.txn.sample_rate float 1 enables probabilistic transaction tracing. It should be used in conjunction with `sql.trace.txn.enable_threshold`. A percentage of transactions between 0 and 1.0 will have tracing enabled, and only those which exceed the configured threshold will be logged. application
394395
sql.ttl.changefeed_replication.disabled boolean false if true, deletes issued by TTL will not be replicated via changefeeds (this setting will be ignored by changefeeds that have the ignore_disable_changefeed_replication option set; such changefeeds will continue to replicate all TTL deletes) application
395396
sql.ttl.default_delete_batch_size integer 100 default amount of rows to delete in a single query during a TTL job application

docs/generated/settings/settings.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,7 @@
345345
<tr><td><div id="setting-sql-trace-log-statement-execute" class="anchored"><code>sql.log.all_statements.enabled<br />(alias: sql.trace.log_statement_execute)</code></div></td><td>boolean</td><td><code>false</code></td><td>set to true to enable logging of all executed statements</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
346346
<tr><td><div id="setting-sql-trace-stmt-enable-threshold" class="anchored"><code>sql.trace.stmt.enable_threshold</code></div></td><td>duration</td><td><code>0s</code></td><td>enables tracing on all statements; statements executing for longer than this duration will have their trace logged (set to 0 to disable); note that enabling this may have a negative performance impact; this setting applies to individual statements within a transaction and is therefore finer-grained than sql.trace.txn.enable_threshold</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
347347
<tr><td><div id="setting-sql-trace-txn-enable-threshold" class="anchored"><code>sql.trace.txn.enable_threshold</code></div></td><td>duration</td><td><code>0s</code></td><td>enables transaction traces for transactions exceeding this duration, used with `sql.trace.txn.sample_rate`</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
348+
<tr><td><div id="setting-sql-trace-txn-jaeger-json-output-enabled" class="anchored"><code>sql.trace.txn.jaeger_json_output.enabled</code></div></td><td>boolean</td><td><code>false</code></td><td>enables Jaeger JSON output for transaction traces in logs</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
348349
<tr><td><div id="setting-sql-trace-txn-sample-rate" class="anchored"><code>sql.trace.txn.sample_rate</code></div></td><td>float</td><td><code>1</code></td><td>enables probabilistic transaction tracing. It should be used in conjunction with `sql.trace.txn.enable_threshold`. A percentage of transactions between 0 and 1.0 will have tracing enabled, and only those which exceed the configured threshold will be logged.</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
349350
<tr><td><div id="setting-sql-ttl-changefeed-replication-disabled" class="anchored"><code>sql.ttl.changefeed_replication.disabled</code></div></td><td>boolean</td><td><code>false</code></td><td>if true, deletes issued by TTL will not be replicated via changefeeds (this setting will be ignored by changefeeds that have the ignore_disable_changefeed_replication option set; such changefeeds will continue to replicate all TTL deletes)</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
350351
<tr><td><div id="setting-sql-ttl-default-delete-batch-size" class="anchored"><code>sql.ttl.default_delete_batch_size</code></div></td><td>integer</td><td><code>100</code></td><td>default amount of rows to delete in a single query during a TTL job</td><td>Serverless/Dedicated/Self-Hosted</td></tr>

pkg/bench/rttanalysis/rtt_analysis_bench.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ func executeRoundTripTest(
223223
if haveExp && !exp.matches(int(res)) && !*rewriteFlag {
224224
reportf(`%s: got %v, expected %v`, b.Name(), res, exp)
225225
dir := getDir()
226-
jaegerJSON, err := r.ToJaegerJSON(tc.Stmt, "", "n0")
226+
jaegerJSON, err := r.ToJaegerJSON(tc.Stmt, "", "n0", true /* indent */)
227227
require.NoError(b, err)
228228
path := filepath.Join(dir, strings.Replace(b.Name(), "/", "_", -1)) + ".jaeger.json"
229229
require.NoError(b, os.WriteFile(path, []byte(jaegerJSON), 0666))

pkg/cli/debug_send_kv_batch.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ func runSendKVBatch(cmd *cobra.Command, args []string) error {
199199
// all the sub-spans. With an empty string, the node ID of the
200200
// node that processes the request is properly annotated in the
201201
// Jaeger UI.
202-
j, err := rec.ToJaegerJSON(ba.Summary(), "", "")
202+
j, err := rec.ToJaegerJSON(ba.Summary(), "", "", true /* indent */)
203203
if err != nil {
204204
return err
205205
}

pkg/sql/conn_executor_exec.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1123,6 +1123,7 @@ func (ex *connExecutor) execStmtInOpenState(
11231123
redactableStmt, /* detail */
11241124
stmtTraceThreshold, /* threshold */
11251125
stmtDur, /* elapsed */
1126+
false, /* outputJaegerJSON */
11261127
)
11271128
} else {
11281129
stmtThresholdSpan.Finish()
@@ -2181,6 +2182,7 @@ func (ex *connExecutor) execStmtInOpenStateWithPausablePortal(
21812182
redactableStmt, /* detail */
21822183
stmtTraceThreshold, /* threshold */
21832184
stmtDur, /* elapsed */
2185+
false, /* outputJaegerJSON */
21842186
)
21852187
} else {
21862188
stmtThresholdSpan.Finish()
@@ -4546,11 +4548,23 @@ func logTraceAboveThreshold(
45464548
opName redact.RedactableString,
45474549
detail redact.RedactableString,
45484550
threshold, elapsed time.Duration,
4551+
outputJaegerJSON bool,
45494552
) {
45504553
if r == nil {
45514554
log.Warning(ctx, "missing trace when threshold tracing was enabled")
45524555
}
4553-
log.SqlExec.Infof(ctx, "%s took %s, exceeding threshold of %s:\n%s\n%s", opName, elapsed, threshold, detail, r)
4556+
output := ""
4557+
var err error
4558+
if outputJaegerJSON {
4559+
output, err = r.ToJaegerJSON("unknown stmt", "no comment", "unknown node", false /* indent */)
4560+
if err != nil {
4561+
log.Warningf(ctx, "trace could not be converted to jaeger JSON: %s", err.Error())
4562+
output = r.String()
4563+
}
4564+
} else {
4565+
output = r.String()
4566+
}
4567+
log.SqlExec.Infof(ctx, "%s took %s, exceeding threshold of %s:\n%s\n%s", opName, elapsed, threshold, detail, output)
45544568
}
45554569

45564570
func (ex *connExecutor) execWithProfiling(

pkg/sql/crdb_internal.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1926,7 +1926,7 @@ CREATE TABLE crdb_internal.cluster_inflight_traces (
19261926
for ; iter.Valid(); iter.Next(ctx) {
19271927
nodeID, recording := iter.Value()
19281928
traceString := recording.String()
1929-
traceJaegerJSON, err := recording.ToJaegerJSON("", "", fmt.Sprintf("node %d", nodeID))
1929+
traceJaegerJSON, err := recording.ToJaegerJSON("", "", fmt.Sprintf("node %d", nodeID), true /* indent */)
19301930
if err != nil {
19311931
return false, err
19321932
}

pkg/sql/exec_util.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,14 @@ var TraceTxnSampleRate = settings.RegisterFloatSetting(
283283
settings.NonNegativeFloatWithMaximum(1.0),
284284
settings.WithPublic)
285285

286+
// TraceTxnOutputJaegerJSON sets the output format of transaction trace logs.
287+
var TraceTxnOutputJaegerJSON = settings.RegisterBoolSetting(
288+
settings.ApplicationLevel,
289+
"sql.trace.txn.jaeger_json_output.enabled",
290+
"enables Jaeger JSON output for transaction traces in logs",
291+
false,
292+
settings.WithPublic)
293+
286294
// TraceStmtThreshold is identical to traceTxnThreshold except it applies to
287295
// individual statements in a transaction. The motivation for this setting is
288296
// to be able to reduce the noise associated with a larger transaction (e.g.

pkg/sql/explain_bundle.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,7 @@ func (b *stmtBundleBuilder) addTrace() {
504504
This trace can be imported into Jaeger for visualization. From the Jaeger Search screen, select the JSON File.
505505
Jaeger can be started using docker with: docker run -d --name jaeger -p 16686:16686 jaegertracing/all-in-one:1.17
506506
The UI can then be accessed at http://localhost:16686/search`, b.stmt)
507-
jaegerJSON, err := b.trace.ToJaegerJSON(b.stmt, comment, "")
507+
jaegerJSON, err := b.trace.ToJaegerJSON(b.stmt, comment, "", true /* indent */)
508508
if err != nil {
509509
b.errorStrings = append(b.errorStrings, fmt.Sprintf("error getting jaeger trace: %v", err))
510510
b.z.AddFile("trace-jaeger.txt", err.Error())

pkg/sql/trace_test.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -675,3 +675,44 @@ func TestTraceTxnSampleRateAndThreshold(t *testing.T) {
675675
})
676676
}
677677
}
678+
679+
func TestTraceTxnJaegerOutput(t *testing.T) {
680+
defer leaktest.AfterTest(t)()
681+
defer log.Scope(t).Close(t)
682+
683+
ctx := context.Background()
684+
settings := cluster.MakeTestingClusterSettings()
685+
686+
s, db, _ := serverutils.StartServer(t, base.TestServerArgs{
687+
Settings: settings,
688+
})
689+
defer s.Stopper().Stop(ctx)
690+
691+
appLogsSpy := logtestutils.NewLogSpy(
692+
t,
693+
// This string match is constructed from the log.SqlExec.Infof format
694+
// string found in conn_executor_exec.go:logTraceAboveThreshold
695+
logtestutils.MatchesF("exceeding threshold of"),
696+
)
697+
cleanup := log.InterceptWith(ctx, appLogsSpy)
698+
defer cleanup()
699+
700+
sql.TraceTxnOutputJaegerJSON.Override(ctx, &settings.SV, true)
701+
sql.TraceTxnThreshold.Override(ctx, &settings.SV, time.Nanosecond)
702+
sql.TraceTxnSampleRate.Override(ctx, &settings.SV, 1)
703+
log.FlushAllSync()
704+
appLogsSpy.Reset()
705+
r := sqlutils.MakeSQLRunner(db)
706+
707+
testutils.SucceedsSoon(t, func() error {
708+
r.Exec(t, "SELECT pg_sleep(0.01)")
709+
log.FlushAllSync()
710+
if !appLogsSpy.Has(logtestutils.MatchesF(regexp.QuoteMeta("ExecStmt: SELECT pg_sleep(0.01)"))) {
711+
return errors.New("no sql txn log found (tracing did not happen)")
712+
}
713+
if !appLogsSpy.Has(logtestutils.MatchesF(regexp.QuoteMeta("{\"refType\":\"CHILD_OF\",\"traceID\":\""))) {
714+
return errors.New("no Jaeger JSON found")
715+
}
716+
return nil
717+
})
718+
}

pkg/sql/txn_state.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ type txnState struct {
9898
// positive duration trigger for logging.
9999
shouldRecord bool
100100

101+
// outputJaegerJSON is used to indicate whether the traces in logs
102+
// should be in a plaintext or Jaeger format.
103+
outputJaegerJSON bool
104+
101105
// recordingThreshold, is not zero, indicates that sp is recording and that
102106
// the recording should be dumped to the log if execution of the transaction
103107
// took more than this.
@@ -215,6 +219,7 @@ func (ts *txnState) resetForNewSQLTxn(
215219

216220
sampleRate := TraceTxnSampleRate.Get(&tranCtx.settings.SV)
217221
ts.shouldRecord = sampleRate > 0 && duration > 0 && rng.Float64() < sampleRate
222+
ts.outputJaegerJSON = TraceTxnOutputJaegerJSON.Get(&tranCtx.settings.SV)
218223

219224
if alreadyRecording || ts.shouldRecord {
220225
ts.Ctx, sp = tracing.EnsureChildSpan(ctx, tranCtx.tracer, opName,
@@ -300,6 +305,7 @@ func (ts *txnState) finishSQLTxn() (txnID uuid.UUID, commitTimestamp hlc.Timesta
300305
redact.Sprint(redact.Safe(txnID)), /* detail */
301306
ts.recordingThreshold, /* threshold */
302307
elapsed, /* elapsed */
308+
ts.outputJaegerJSON, /* outputJaegerJSON */
303309
)
304310
}
305311
}

0 commit comments

Comments
 (0)