knz
diff --git a/‎docs/generated/http/full.md‎
Lines changed: 2 additions & 0 deletions b/‎docs/generated/http/full.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎docs/generated/settings/settings-for-tenants.txt‎
Lines changed: 1 addition & 1 deletion b/‎docs/generated/settings/settings-for-tenants.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/generated/settings/settings.html‎
Lines changed: 1 addition & 1 deletion b/‎docs/generated/settings/settings.html‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pkg/cli/clisqlclient/statement_diag.go‎
Lines changed: 52 additions & 5 deletions b/‎pkg/cli/clisqlclient/statement_diag.go‎
Lines changed: 52 additions & 5 deletions
diff --git a/‎pkg/cli/statement_diag.go‎
Lines changed: 3 additions & 3 deletions b/‎pkg/cli/statement_diag.go‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎pkg/cli/statement_diag_test.go‎
Lines changed: 24 additions & 22 deletions b/‎pkg/cli/statement_diag_test.go‎
Lines changed: 24 additions & 22 deletions
diff --git a/‎pkg/cli/zip_table_registry.go‎
Lines changed: 2 additions & 0 deletions b/‎pkg/cli/zip_table_registry.go‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎pkg/clusterversion/cockroach_versions.go‎
Lines changed: 8 additions & 0 deletions b/‎pkg/clusterversion/cockroach_versions.go‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎pkg/server/serverpb/status.proto‎
Lines changed: 11 additions & 0 deletions b/‎pkg/server/serverpb/status.proto‎
Lines changed: 11 additions & 0 deletions
@@ -4388,6 +4388,8 @@ Support status: [reserved](#support-status)
 | min_execution_latency | [google.protobuf.Duration](#cockroach.server.serverpb.CreateStatementDiagnosticsReportRequest-google.protobuf.Duration) |  | MinExecutionLatency, when non-zero, indicates the minimum execution latency of a query for which to collect the diagnostics report. In other words, if a query executes faster than this threshold, then the diagnostics report is not collected on it, and we will try to get a bundle the next time we see the query fingerprint.<br><br>NB: if MinExecutionLatency is non-zero, then all queries that match the fingerprint will be traced until a slow enough query comes along. This tracing might have some performance overhead. | [reserved](#support-status) |
 | expires_after | [google.protobuf.Duration](#cockroach.server.serverpb.CreateStatementDiagnosticsReportRequest-google.protobuf.Duration) |  | ExpiresAfter, when non-zero, sets the expiration interval of this request. | [reserved](#support-status) |
 | sampling_probability | [double](#cockroach.server.serverpb.CreateStatementDiagnosticsReportRequest-double) |  | SamplingProbability controls how likely we are to try and collect a diagnostics report for a given execution. The semantics with MinExecutionLatency are worth noting (and perhaps simplifying?): - If SamplingProbability is zero, we're always sampling. This is for   compatibility with pre-22.2 versions where this parameter was not   available. - If SamplingProbability is non-zero, MinExecutionLatency must be non-zero.   We'll sample stmt executions with the given probability until:   (a) we capture one that exceeds MinExecutionLatency, or   (b) we hit the ExpiresAfter point.<br><br>SamplingProbability lets users control at a per-stmt granularity how much collection overhead is acceptable to try an capture an outlier execution for further analysis (are high p99.9s due to latch waits? racing with split transfers?). A high sampling rate can capture a trace sooner, but the added overhead may also cause the trace to be non-representative if the tracing overhead across all requests is causing resource saturation (network, memory) and resulting in slowdown.<br><br>TODO(irfansharif): Wire this up to the UI code. When selecting the latency threshold, we should want to force specifying a sampling probability.<br><br>TODO(irfansharif): We could do better than a hard-coded default value for probability (100% could be too high-overhead so probably not the right one). Strawman: could consider the recent request rate for the fingerprint (say averaged over the last 10m? 30m?), consider what %-ile the latency target we're looking to capture is under, and suggest a sampling probability that gets you at least one trace in the next T seconds with 95% likelihood? Or provide a hint for how long T is for the currently chosen sampling probability. | [reserved](#support-status) |
+| plan_gist | [string](#cockroach.server.serverpb.CreateStatementDiagnosticsReportRequest-string) |  | PlanGist, when set, indicates a particular plan that we want collect diagnostics for. This can be useful when a single fingerprint can result in multiple plans.<br><br>There is a caveat to using this filtering: since the plan gist for a running query is only available after the optimizer has done its part, the trace will only include things after the optimizer is done. | [reserved](#support-status) |
+| anti_plan_gist | [bool](#cockroach.server.serverpb.CreateStatementDiagnosticsReportRequest-bool) |  | AntiPlanGist, when set, indicates that any plan not matching PlanGist will do. | [reserved](#support-status) |
 
 
 
 
@@ -304,4 +304,4 @@ trace.opentelemetry.collector	string		address of an OpenTelemetry trace collecto
 trace.snapshot.rate	duration	0s	if non-zero, interval at which background trace snapshots are captured	tenant-rw
 trace.span_registry.enabled	boolean	true	if set, ongoing traces can be seen at https://<ui>/#/debug/tracez	tenant-rw
 trace.zipkin.collector	string		the address of a Zipkin instance to receive traces, as <host>:<port>. If no port is specified, 9411 will be used.	tenant-rw
-version	version	1000023.1-16	set the active cluster version in the format '<major>.<minor>'	tenant-rw
+version	version	1000023.1-18	set the active cluster version in the format '<major>.<minor>'	tenant-rw
@@ -260,6 +260,6 @@
 <tr><td><div id="setting-trace-span-registry-enabled" class="anchored"><code>trace.span_registry.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if set, ongoing traces can be seen at https://&lt;ui&gt;/#/debug/tracez</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
 <tr><td><div id="setting-trace-zipkin-collector" class="anchored"><code>trace.zipkin.collector</code></div></td><td>string</td><td><code></code></td><td>the address of a Zipkin instance to receive traces, as &lt;host&gt;:&lt;port&gt;. If no port is specified, 9411 will be used.</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
 <tr><td><div id="setting-ui-display-timezone" class="anchored"><code>ui.display_timezone</code></div></td><td>enumeration</td><td><code>etc/utc</code></td><td>the timezone used to format timestamps in the ui [etc/utc = 0, america/new_york = 1]</td><td>Dedicated/Self-Hosted</td></tr>
-<tr><td><div id="setting-version" class="anchored"><code>version</code></div></td><td>version</td><td><code>1000023.1-16</code></td><td>set the active cluster version in the format &#39;&lt;major&gt;.&lt;minor&gt;&#39;</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
+<tr><td><div id="setting-version" class="anchored"><code>version</code></div></td><td>version</td><td><code>1000023.1-18</code></td><td>set the active cluster version in the format &#39;&lt;major&gt;.&lt;minor&gt;&#39;</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
 </tbody>
 </table>
@@ -13,6 +13,7 @@ package clisqlclient
 import (
 	"context"
 	"database/sql/driver"
+	"fmt"
 	"io"
 	"os"
 	"strconv"
@@ -101,8 +102,13 @@ func stmtDiagListBundlesInternal(ctx context.Context, conn Conn) ([]StmtDiagBund
 type StmtDiagActivationRequest struct {
 	ID int64
 	// Statement is the SQL statement fingerprint.
-	Statement   string
-	RequestedAt time.Time
+	Statement string
+	// If empty then any plan will do.
+	PlanGist string
+	// If true and PlanGist is not empty, then any plan not matching the gist
+	// will do.
+	AntiPlanGist bool
+	RequestedAt  time.Time
 	// Zero value indicates that there is no sampling probability set on the
 	// request.
 	SamplingProbability float64
@@ -126,9 +132,38 @@ func StmtDiagListOutstandingRequests(
 	return result, nil
 }
 
+func isAtLeast23dot2ClusterVersion(ctx context.Context, conn Conn) (bool, error) {
+	// Check whether the upgrade to add the plan_gist and anti_plan_gist columns
+	// to the statement_diagnostics_requests system table has already been run.
+	row, err := conn.QueryRow(ctx, `
+ SELECT
+   count(*)
+ FROM
+   [SHOW COLUMNS FROM system.statement_diagnostics_requests]
+ WHERE
+   column_name = 'plan_gist';`)
+	if err != nil {
+		return false, err
+	}
+	c, ok := row[0].(int64)
+	if !ok {
+		return false, nil
+	}
+	return c == 1, nil
+}
+
 func stmtDiagListOutstandingRequestsInternal(
 	ctx context.Context, conn Conn,
 ) ([]StmtDiagActivationRequest, error) {
+	var extraColumns string
+	atLeast23dot2, err := isAtLeast23dot2ClusterVersion(ctx, conn)
+	if err != nil {
+		return nil, err
+	}
+	if atLeast23dot2 {
+		extraColumns = ", plan_gist, anti_plan_gist"
+	}
+
 	// Converting an INTERVAL to a number of milliseconds within that interval
 	// is a pain - we extract the number of seconds and multiply it by 1000,
 	// then we extract the number of milliseconds and add that up to the
@@ -138,16 +173,16 @@ func stmtDiagListOutstandingRequestsInternal(
                         EXTRACT(millisecond FROM min_execution_latency)::INT8 -
                         EXTRACT(second FROM min_execution_latency)::INT8 * 1000`
 	rows, err := conn.Query(ctx,
-		"SELECT id, statement_fingerprint, requested_at, "+getMilliseconds+`, expires_at, sampling_probability
+		fmt.Sprintf("SELECT id, statement_fingerprint, requested_at, "+getMilliseconds+`, expires_at, sampling_probability%s
 			FROM system.statement_diagnostics_requests
 			WHERE NOT completed
-			ORDER BY requested_at DESC`,
+			ORDER BY requested_at DESC`, extraColumns),
 	)
 	if err != nil {
 		return nil, err
 	}
 	var result []StmtDiagActivationRequest
-	vals := make([]driver.Value, 6)
+	vals := make([]driver.Value, 8)
 	for {
 		if err := rows.Next(vals); err == io.EOF {
 			break
@@ -157,6 +192,8 @@ func stmtDiagListOutstandingRequestsInternal(
 		var minExecutionLatency time.Duration
 		var expiresAt time.Time
 		var samplingProbability float64
+		var planGist string
+		var antiPlanGist bool
 
 		if ms, ok := vals[3].(int64); ok {
 			minExecutionLatency = time.Millisecond * time.Duration(ms)
@@ -167,9 +204,19 @@ func stmtDiagListOutstandingRequestsInternal(
 		if sp, ok := vals[5].(float64); ok {
 			samplingProbability = sp
 		}
+		if atLeast23dot2 {
+			if gist, ok := vals[6].(string); ok {
+				planGist = gist
+			}
+			if antiGist, ok := vals[7].(bool); ok {
+				antiPlanGist = antiGist
+			}
+		}
 		info := StmtDiagActivationRequest{
 			ID:                  vals[0].(int64),
 			Statement:           vals[1].(string),
+			PlanGist:            planGist,
+			AntiPlanGist:        antiPlanGist,
 			RequestedAt:         vals[2].(time.Time),
 			SamplingProbability: samplingProbability,
 			MinExecutionLatency: minExecutionLatency,
 
@@ -85,7 +85,7 @@ func runStmtDiagList(cmd *cobra.Command, args []string) (resErr error) {
 	} else {
 		fmt.Printf("Outstanding activation requests:\n")
 		w := tabwriter.NewWriter(&buf, 4, 0, 2, ' ', 0)
-		fmt.Fprint(w, "  ID\tActivation time\tStatement\tSampling probability\tMin execution latency\tExpires at\n")
+		fmt.Fprint(w, "  ID\tActivation time\tStatement\tPlan gist\tAnti plan gist\tSampling probability\tMin execution latency\tExpires at\n")
 		for _, r := range reqs {
 			minExecLatency := "N/A"
 			if r.MinExecutionLatency != 0 {
@@ -102,8 +102,8 @@ func runStmtDiagList(cmd *cobra.Command, args []string) (resErr error) {
 				samplingProbability = fmt.Sprintf("%0.4f", r.SamplingProbability)
 			}
 			fmt.Fprintf(
-				w, "  %d\t%s\t%s\t%s\t%s\t%s\n",
-				r.ID, r.RequestedAt.UTC().Format(timeFmt), r.Statement, samplingProbability, minExecLatency, expiresAt,
+				w, "  %d\t%s\t%s\t%s\t%t\t%s\t%s\t%s\n",
+				r.ID, r.RequestedAt.UTC().Format(timeFmt), r.Statement, r.PlanGist, r.AntiPlanGist, samplingProbability, minExecLatency, expiresAt,
 			)
 		}
 		_ = w.Flush()
 
@@ -31,13 +31,15 @@ func Example_statement_diag() {
 		        (20, 'SELECT _ FROM _ WHERE _ > _', 'SELECT a FROM t WHERE b > 1', '2010-01-02 03:04:06', ARRAY[1001,1002,1003]),
 		        (30, 'SELECT _ FROM _ WHERE _ > _', 'SELECT a FROM t WHERE b > 1', '2010-01-02 03:04:07', ARRAY[1001])`,
 
-		`INSERT INTO system.statement_diagnostics_requests(id, completed, statement_fingerprint, statement_diagnostics_id, requested_at, sampling_probability, min_execution_latency, expires_at)
-		 VALUES (1, TRUE, 'SELECT _ FROM _', 10, '2010-01-02 03:04:00', NULL, NULL, NULL),
-		        (2, TRUE, 'SELECT _ FROM _ WHERE _ > _', 20, '2010-01-02 03:04:02', 0.5, NULL, NULL),
-		        (3, TRUE, 'SELECT _ FROM _ WHERE _ > _', 30, '2010-01-02 03:04:05', 1.0, NULL, NULL),
-		        (4, FALSE, 'SELECT _ + _', NULL, '2010-01-02 03:04:10', 0.8, '1d 2h 3m 4s 5ms 6us', NULL),
-		        (5, FALSE, 'SELECT _ - _', NULL, '2010-01-02 03:04:11', 1.0, NULL, '2030-01-02 03:04:12'),
-		        (6, FALSE, 'SELECT _ / _', NULL, '2010-01-02 03:04:12', NULL, '0s', NULL)`,
+		`INSERT INTO system.statement_diagnostics_requests(
+                     id, completed, statement_fingerprint, plan_gist, anti_plan_gist, statement_diagnostics_id,
+                     requested_at, sampling_probability, min_execution_latency, expires_at)
+		 VALUES (1, TRUE, 'SELECT _ FROM _', '', NULL, 10, '2010-01-02 03:04:00', NULL, NULL, NULL),
+		        (2, TRUE, 'SELECT _ FROM _ WHERE _ > _', '', NULL, 20, '2010-01-02 03:04:02', 0.5, NULL, NULL),
+		        (3, TRUE, 'SELECT _ FROM _ WHERE _ > _', '', NULL, 30, '2010-01-02 03:04:05', 1.0, NULL, NULL),
+		        (4, FALSE, 'SELECT _ + _', '', NULL, NULL, '2010-01-02 03:04:10', 0.8, '1d 2h 3m 4s 5ms 6us', NULL),
+		        (5, FALSE, 'SELECT _ - _', 'foo', false, NULL, '2010-01-02 03:04:11', 1.0, NULL, '2030-01-02 03:04:12'),
+		        (6, FALSE, 'SELECT _ / _', 'bar', true, NULL, '2010-01-02 03:04:12', NULL, '0s', NULL)`,
 	}
 
 	for _, cmd := range commands {
@@ -94,10 +96,10 @@ func Example_statement_diag() {
 	//   10  2010-01-02 03:04:05 UTC  SELECT _ FROM _
 	//
 	// Outstanding activation requests:
-	//   ID  Activation time          Statement     Sampling probability  Min execution latency  Expires at
-	//   6   2010-01-02 03:04:12 UTC  SELECT _ / _  1.0000                N/A                    never
-	//   5   2010-01-02 03:04:11 UTC  SELECT _ - _  1.0000                N/A                    2030-01-02 03:04:12 +0000 UTC
-	//   4   2010-01-02 03:04:10 UTC  SELECT _ + _  0.8000                26h3m4.005s            never
+	//   ID  Activation time          Statement     Plan gist  Anti plan gist  Sampling probability  Min execution latency  Expires at
+	//   6   2010-01-02 03:04:12 UTC  SELECT _ / _  bar        true            1.0000                N/A                    never
+	//   5   2010-01-02 03:04:11 UTC  SELECT _ - _  foo        false           1.0000                N/A                    2030-01-02 03:04:12 +0000 UTC
+	//   4   2010-01-02 03:04:10 UTC  SELECT _ + _             false           0.8000                26h3m4.005s            never
 	// statement-diag download 13
 	// ERROR: failed to download statement diagnostics bundle 13 to 'stmt-bundle-13.zip': no statement diagnostics bundle with ID 13
 	// statement-diag download 20 tempfile.zip
@@ -120,18 +122,18 @@ func Example_statement_diag() {
 	//   20  2010-01-02 03:04:06 UTC  SELECT _ FROM _ WHERE _ > _
 	//
 	// Outstanding activation requests:
-	//   ID  Activation time          Statement     Sampling probability  Min execution latency  Expires at
-	//   6   2010-01-02 03:04:12 UTC  SELECT _ / _  1.0000                N/A                    never
-	//   5   2010-01-02 03:04:11 UTC  SELECT _ - _  1.0000                N/A                    2030-01-02 03:04:12 +0000 UTC
-	//   4   2010-01-02 03:04:10 UTC  SELECT _ + _  0.8000                26h3m4.005s            never
+	//   ID  Activation time          Statement     Plan gist  Anti plan gist  Sampling probability  Min execution latency  Expires at
+	//   6   2010-01-02 03:04:12 UTC  SELECT _ / _  bar        true            1.0000                N/A                    never
+	//   5   2010-01-02 03:04:11 UTC  SELECT _ - _  foo        false           1.0000                N/A                    2030-01-02 03:04:12 +0000 UTC
+	//   4   2010-01-02 03:04:10 UTC  SELECT _ + _             false           0.8000                26h3m4.005s            never
 	// statement-diag delete --all
 	// statement-diag list
 	// No statement diagnostics bundles available.
 	// Outstanding activation requests:
-	//   ID  Activation time          Statement     Sampling probability  Min execution latency  Expires at
-	//   6   2010-01-02 03:04:12 UTC  SELECT _ / _  1.0000                N/A                    never
-	//   5   2010-01-02 03:04:11 UTC  SELECT _ - _  1.0000                N/A                    2030-01-02 03:04:12 +0000 UTC
-	//   4   2010-01-02 03:04:10 UTC  SELECT _ + _  0.8000                26h3m4.005s            never
+	//   ID  Activation time          Statement     Plan gist  Anti plan gist  Sampling probability  Min execution latency  Expires at
+	//   6   2010-01-02 03:04:12 UTC  SELECT _ / _  bar        true            1.0000                N/A                    never
+	//   5   2010-01-02 03:04:11 UTC  SELECT _ - _  foo        false           1.0000                N/A                    2030-01-02 03:04:12 +0000 UTC
+	//   4   2010-01-02 03:04:10 UTC  SELECT _ + _             false           0.8000                26h3m4.005s            never
 	// statement-diag cancel xx
 	// ERROR: invalid ID
 	// statement-diag cancel 5 6
@@ -142,9 +144,9 @@ func Example_statement_diag() {
 	// statement-diag list
 	// No statement diagnostics bundles available.
 	// Outstanding activation requests:
-	//   ID  Activation time          Statement     Sampling probability  Min execution latency  Expires at
-	//   6   2010-01-02 03:04:12 UTC  SELECT _ / _  1.0000                N/A                    never
-	//   5   2010-01-02 03:04:11 UTC  SELECT _ - _  1.0000                N/A                    2030-01-02 03:04:12 +0000 UTC
+	//   ID  Activation time          Statement     Plan gist  Anti plan gist  Sampling probability  Min execution latency  Expires at
+	//   6   2010-01-02 03:04:12 UTC  SELECT _ / _  bar        true            1.0000                N/A                    never
+	//   5   2010-01-02 03:04:11 UTC  SELECT _ - _  foo        false           1.0000                N/A                    2030-01-02 03:04:12 +0000 UTC
 	// statement-diag cancel 123
 	// ERROR: no outstanding activation request with ID 123
 	// statement-diag cancel --all
 
@@ -1232,6 +1232,8 @@ var zipSystemTables = DebugZipTableRegistry{
 			"min_execution_latency",
 			"expires_at",
 			"sampling_probability",
+			"plan_gist",
+			"anti_plan_gist",
 		},
 	},
 	"system.table_statistics": {
 
@@ -553,6 +553,10 @@ const (
 	// FormatVirtualSSTables, allowing use of virtual sstables in Pebble.
 	V23_2_PebbleFormatVirtualSSTables
 
+	// V23_2_StmtDiagForPlanGist enables statement diagnostic feature to collect
+	// the bundle for particular plan gist.
+	V23_2_StmtDiagForPlanGist
+
 	// *************************************************
 	// Step (1) Add new versions here.
 	// Do not add new versions to a patch release.
@@ -962,6 +966,10 @@ var rawVersionsSingleton = keyedVersions{
 		Key:     V23_2_PebbleFormatVirtualSSTables,
 		Version: roachpb.Version{Major: 23, Minor: 1, Internal: 16},
 	},
+	{
+		Key:     V23_2_StmtDiagForPlanGist,
+		Version: roachpb.Version{Major: 23, Minor: 1, Internal: 18},
+	},
 
 	// *************************************************
 	// Step (2): Add new versions here.
 
@@ -1800,6 +1800,17 @@ message CreateStatementDiagnosticsReportRequest {
   // likelihood? Or provide a hint for how long T is for the currently chosen
   // sampling probability.
   double sampling_probability = 4;
+  // PlanGist, when set, indicates a particular plan that we want collect
+  // diagnostics for. This can be useful when a single fingerprint can result in
+  // multiple plans.
+  //
+  // There is a caveat to using this filtering: since the plan gist for a
+  // running query is only available after the optimizer has done its part, the
+  // trace will only include things after the optimizer is done.
+  string plan_gist = 5;
+  // AntiPlanGist, when set, indicates that any plan not matching PlanGist will
+  // do.
+  bool anti_plan_gist = 6;
 }
 
 message CreateStatementDiagnosticsReportResponse {
Original file line number	Diff line number	Diff line change
`@@ -85,7 +85,7 @@ func runStmtDiagList(cmd *cobra.Command, args []string) (resErr error) {`
`85`	`85`	`} else {`
`86`	`86`	`fmt.Printf("Outstanding activation requests:\n")`
`87`	`87`	`w := tabwriter.NewWriter(&buf, 4, 0, 2, ' ', 0)`
`88`		`- fmt.Fprint(w, " ID\tActivation time\tStatement\tSampling probability\tMin execution latency\tExpires at\n")`
	`88`	`+ fmt.Fprint(w, " ID\tActivation time\tStatement\tPlan gist\tAnti plan gist\tSampling probability\tMin execution latency\tExpires at\n")`
`89`	`89`	`for _, r := range reqs {`
`90`	`90`	`minExecLatency := "N/A"`
`91`	`91`	`if r.MinExecutionLatency != 0 {`
`@@ -102,8 +102,8 @@ func runStmtDiagList(cmd *cobra.Command, args []string) (resErr error) {`
`102`	`102`	`samplingProbability = fmt.Sprintf("%0.4f", r.SamplingProbability)`
`103`	`103`	`}`
`104`	`104`	`fmt.Fprintf(`
`105`		`- w, " %d\t%s\t%s\t%s\t%s\t%s\n",`
`106`		`- r.ID, r.RequestedAt.UTC().Format(timeFmt), r.Statement, samplingProbability, minExecLatency, expiresAt,`
	`105`	`+ w, " %d\t%s\t%s\t%s\t%t\t%s\t%s\t%s\n",`
	`106`	`+ r.ID, r.RequestedAt.UTC().Format(timeFmt), r.Statement, r.PlanGist, r.AntiPlanGist, samplingProbability, minExecLatency, expiresAt,`
`107`	`107`	`)`
`108`	`108`	`}`
`109`	`109`	`_ = w.Flush()`