
Commit fba7dbf

sql: remove gossip-based DistSQL physical planning
In the 25.1 release we switched to SQL-instance-based DistSQL physical planning but kept gossip-based planning around just in case (controlled via a cluster setting). We haven't seen any fallout from that change, and given that many different workloads have been upgraded to 25.2, it now seems safe to remove gossip-based planning entirely.

Release note: None
1 parent c482610 commit fba7dbf

File tree

4 files changed: +5 −183 lines

pkg/settings/registry.go
pkg/sql/distsql_physical_planner.go
pkg/sql/distsql_physical_planner_test.go
pkg/sql/distsql_plan_bulk.go

pkg/settings/registry.go

Lines changed: 3 additions & 0 deletions
@@ -274,6 +274,9 @@ var retiredSettings = map[InternalKey]struct{}{
 
 	// removed as of 25.4
 	"storage.columnar_blocks.enabled": {},
+
+	// removed as of 26.1
+	"sql.distsql_planning.use_gossip.enabled": {},
 }
 
 // grandfatheredDefaultSettings is the list of "grandfathered" existing sql.defaults
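For readers unfamiliar with the registry's retired-settings list, here is a minimal standalone sketch of the tombstoning pattern: once a setting is removed, its name is kept in a set so it cannot be accidentally reused. The `register` and `registry` helpers below are made up for illustration; this is not the actual pkg/settings implementation.

```go
// Minimal sketch of a "retired settings" tombstone, assuming the goal is
// simply to reject re-registration of removed keys.
package main

import "fmt"

var retiredSettings = map[string]struct{}{
	"storage.columnar_blocks.enabled":         {}, // removed as of 25.4
	"sql.distsql_planning.use_gossip.enabled": {}, // removed as of 26.1
}

// registry holds the currently registered settings (name -> description).
var registry = map[string]string{}

// register adds a setting definition, refusing names that were retired.
func register(key, desc string) error {
	if _, ok := retiredSettings[key]; ok {
		return fmt.Errorf("setting %q was retired and cannot be re-registered", key)
	}
	registry[key] = desc
	return nil
}

func main() {
	err := register("sql.distsql_planning.use_gossip.enabled", "gossip fallback")
	fmt.Println(err) // the retired key is rejected
}
```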

pkg/sql/distsql_physical_planner.go

Lines changed: 0 additions & 76 deletions
@@ -1314,13 +1314,6 @@ const (
 	// SpanPartitionReason_ROUND_ROBIN is reported when there is no locality info
 	// on any of the instances and so we default to a naive round-robin strategy.
 	SpanPartitionReason_ROUND_ROBIN
-	// SpanPartitionReason_GOSSIP_GATEWAY_TARGET_UNHEALTHY is reported when the
-	// target node retrieved via gossip is deemed unhealthy. In this case we
-	// default to the gateway node.
-	SpanPartitionReason_GOSSIP_GATEWAY_TARGET_UNHEALTHY
-	// SpanPartitionReason_GOSSIP_TARGET_HEALTHY is reported when the
-	// target node retrieved via gossip is deemed healthy.
-	SpanPartitionReason_GOSSIP_TARGET_HEALTHY
 	// SpanPartitionReason_LOCALITY_FILTERED_RANDOM_GATEWAY_OVERLOADED is reported
 	// when there is no match to the provided locality filter and the gateway is
 	// eligible but overloaded with other partitions. In this case we pick a
@@ -1350,10 +1343,6 @@ func (r SpanPartitionReason) String() string {
 		return "locality-filtered-random"
 	case SpanPartitionReason_ROUND_ROBIN:
 		return "round-robin"
-	case SpanPartitionReason_GOSSIP_GATEWAY_TARGET_UNHEALTHY:
-		return "gossip-gateway-target-unhealthy"
-	case SpanPartitionReason_GOSSIP_TARGET_HEALTHY:
-		return "gossip-target-healthy"
 	case SpanPartitionReason_LOCALITY_FILTERED_RANDOM_GATEWAY_OVERLOADED:
 		return "locality-filtered-random-gateway-overloaded"
 	default:
@@ -1527,9 +1516,6 @@ func (dsp *DistSQLPlanner) partitionSpansEx(
 		return []SpanPartition{{SQLInstanceID: dsp.gatewaySQLInstanceID, Spans: spans}},
 			true /* ignoreMisplannedRanges */, nil
 	}
-	if dsp.useGossipPlanning(ctx, planCtx) {
-		return dsp.deprecatedPartitionSpansSystem(ctx, planCtx, spans)
-	}
 	return dsp.partitionSpans(ctx, planCtx, spans, bound)
 }
 
@@ -1651,27 +1637,6 @@ func (dsp *DistSQLPlanner) partitionSpan(
 	return partitions, lastPartitionIdx, nil
 }
 
-// deprecatedPartitionSpansSystem finds node owners for ranges touching the given spans
-// for a system tenant.
-func (dsp *DistSQLPlanner) deprecatedPartitionSpansSystem(
-	ctx context.Context, planCtx *PlanningCtx, spans roachpb.Spans,
-) (partitions []SpanPartition, ignoreMisplannedRanges bool, _ error) {
-	nodeMap := make(map[base.SQLInstanceID]int)
-	resolver := func(nodeID roachpb.NodeID) (base.SQLInstanceID, SpanPartitionReason) {
-		return dsp.deprecatedHealthySQLInstanceIDForKVNodeIDSystem(ctx, planCtx, nodeID)
-	}
-	for _, span := range spans {
-		var err error
-		partitions, _, err = dsp.partitionSpan(
-			ctx, planCtx, span, partitions, nodeMap, resolver, &ignoreMisplannedRanges,
-		)
-		if err != nil {
-			return nil, false, err
-		}
-	}
-	return partitions, ignoreMisplannedRanges, nil
-}
-
 // partitionSpans assigns SQL instances to spans. In mixed sql and KV mode it
 // generally assigns each span to the instance hosted on the KV node chosen by
 // the configured replica oracle, while in clusters operating with standalone
@@ -1730,26 +1695,6 @@ func (dsp *DistSQLPlanner) partitionSpans(
 	return partitions, ignoreMisplannedRanges, nil
 }
 
-// deprecatedHealthySQLInstanceIDForKVNodeIDSystem returns the SQL instance that
-// should handle the range with the given node ID when planning is done on
-// behalf of the system tenant. It ensures that the chosen SQL instance is
-// healthy and of the compatible DistSQL version.
-func (dsp *DistSQLPlanner) deprecatedHealthySQLInstanceIDForKVNodeIDSystem(
-	ctx context.Context, planCtx *PlanningCtx, nodeID roachpb.NodeID,
-) (base.SQLInstanceID, SpanPartitionReason) {
-	sqlInstanceID := base.SQLInstanceID(nodeID)
-	status := dsp.checkInstanceHealthAndVersionSystem(ctx, planCtx, sqlInstanceID)
-	// If the node is unhealthy, use the gateway to process this span instead of
-	// the unhealthy host. An empty address indicates an unhealthy host.
-	reason := SpanPartitionReason_GOSSIP_TARGET_HEALTHY
-	if status != NodeOK {
-		log.VEventf(ctx, 2, "not planning on node %d: %s", sqlInstanceID, status)
-		sqlInstanceID = dsp.gatewaySQLInstanceID
-		reason = SpanPartitionReason_GOSSIP_GATEWAY_TARGET_UNHEALTHY
-	}
-	return sqlInstanceID, reason
-}
-
 // checkInstanceHealth returns the instance health status by dialing the node.
 // It also caches the result to avoid redialing for a query.
 func (dsp *DistSQLPlanner) checkInstanceHealth(
@@ -2129,10 +2074,6 @@ func (dsp *DistSQLPlanner) getInstanceIDForScan(
 		return 0, err
 	}
 
-	if dsp.useGossipPlanning(ctx, planCtx) {
-		sqlInstanceID, _ := dsp.deprecatedHealthySQLInstanceIDForKVNodeIDSystem(ctx, planCtx, replDesc.NodeID)
-		return sqlInstanceID, nil
-	}
 	resolver, err := dsp.makeInstanceResolver(ctx, planCtx)
 	if err != nil {
 		return 0, err
@@ -2142,23 +2083,6 @@ func (dsp *DistSQLPlanner) getInstanceIDForScan(
 	return sqlInstanceID, nil
 }
 
-// TODO(yuzefovich): retire this setting altogether in 25.3 release.
-var useGossipPlanning = settings.RegisterBoolSetting(
-	settings.ApplicationLevel,
-	"sql.distsql_planning.use_gossip.enabled",
-	"if enabled, the DistSQL physical planner falls back to gossip-based planning",
-	false,
-)
-
-func (dsp *DistSQLPlanner) useGossipPlanning(_ context.Context, planCtx *PlanningCtx) bool {
-	var gossipPlanningEnabled bool
-	// Some of the planCtx fields can be left unset in tests.
-	if planCtx.ExtendedEvalCtx != nil && planCtx.ExtendedEvalCtx.Settings != nil {
-		gossipPlanningEnabled = useGossipPlanning.Get(&planCtx.ExtendedEvalCtx.Settings.SV)
-	}
-	return dsp.codec.ForSystemTenant() && planCtx.localityFilter.Empty() && gossipPlanningEnabled
-}
-
 // convertOrdering maps the columns in props.ordering to the output columns of a
 // processor.
 func (dsp *DistSQLPlanner) convertOrdering(
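To summarize what the deletions above take out, here is a self-contained sketch of the old gossip-era resolution step: for the system tenant, the SQL instance ID was assumed to equal the KV node ID, and an unhealthy target fell back to the gateway. The types and the `healthy` callback below are simplified stand-ins, not the planner's real signatures.

```go
package main

import "fmt"

// SQLInstanceID and NodeID are simplified stand-ins for base.SQLInstanceID
// and roachpb.NodeID.
type SQLInstanceID int32
type NodeID int32

type planner struct {
	gatewayID SQLInstanceID
	healthy   func(SQLInstanceID) bool
}

// resolveSystem mirrors the shape of the removed
// deprecatedHealthySQLInstanceIDForKVNodeIDSystem: pick the SQL instance
// co-located with the KV node, or fall back to the gateway when that
// instance is unhealthy.
func (p *planner) resolveSystem(nodeID NodeID) (SQLInstanceID, string) {
	id := SQLInstanceID(nodeID)
	if !p.healthy(id) {
		return p.gatewayID, "gossip-gateway-target-unhealthy"
	}
	return id, "gossip-target-healthy"
}

func main() {
	p := &planner{
		gatewayID: 1,
		healthy:   func(id SQLInstanceID) bool { return id != 3 }, // pretend node 3 is down
	}
	fmt.Println(p.resolveSystem(2)) // 2 gossip-target-healthy
	fmt.Println(p.resolveSystem(3)) // 1 gossip-gateway-target-unhealthy
}
```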

pkg/sql/distsql_physical_planner_test.go

Lines changed: 0 additions & 107 deletions
@@ -19,7 +19,6 @@ import (
 
 	"github.com/cockroachdb/cockroach/pkg/base"
 	"github.com/cockroachdb/cockroach/pkg/gossip"
-	"github.com/cockroachdb/cockroach/pkg/keys"
 	"github.com/cockroachdb/cockroach/pkg/kv"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangecache"
@@ -1751,112 +1750,6 @@ func TestShouldPickGatewayNode(t *testing.T) {
 	}
 }
 
-// Test that a node whose descriptor info is not accessible through gossip is
-// not used. This is to simulate nodes that have been decomisioned and also
-// nodes that have been "replaced" by another node at the same address (which, I
-// guess, is also a type of decomissioning).
-func TestPartitionSpansSkipsNodesNotInGossip(t *testing.T) {
-	defer leaktest.AfterTest(t)()
-	defer log.Scope(t).Close(t)
-
-	// The spans that we're going to plan for.
-	span := roachpb.Span{Key: roachpb.Key("A"), EndKey: roachpb.Key("Z")}
-	gatewayNode := roachpb.NodeID(2)
-	ranges := []testSpanResolverRange{{"A", 1}, {"B", 2}, {"C", 1}}
-
-	stopper := stop.NewStopper()
-	defer stopper.Stop(context.Background())
-	codec := keys.SystemSQLCodec
-
-	mockGossip := gossip.NewTest(roachpb.NodeID(1), stopper, metric.NewRegistry())
-	var nodeDescs []*roachpb.NodeDescriptor
-	for i := 1; i <= 2; i++ {
-		sqlInstanceID := base.SQLInstanceID(i)
-		desc := &roachpb.NodeDescriptor{
-			NodeID:  roachpb.NodeID(sqlInstanceID),
-			Address: util.UnresolvedAddr{AddressField: fmt.Sprintf("addr%d", i)},
-		}
-		if i == 2 {
-			if err := mockGossip.SetNodeDescriptor(desc); err != nil {
-				t.Fatal(err)
-			}
-		}
-		// All the nodes advertise that they are not draining. This is to
-		// simulate the "node overridden by another node at the same address"
-		// case mentioned in the test comment - for such a node, the descriptor
-		// would be taken out of the gossip data, but other datums it advertised
-		// are left in place.
-		if err := mockGossip.AddInfoProto(
-			gossip.MakeDistSQLDrainingKey(sqlInstanceID),
-			&execinfrapb.DistSQLDrainingInfo{
-				Draining: false,
-			},
-			0, // ttl - no expiration
-		); err != nil {
-			t.Fatal(err)
-		}
-
-		nodeDescs = append(nodeDescs, desc)
-	}
-	tsp := &testSpanResolver{
-		nodes:  nodeDescs,
-		ranges: ranges,
-	}
-
-	st := cluster.MakeTestingClusterSettings()
-	gw := gossip.MakeOptionalGossip(mockGossip)
-	dsp := DistSQLPlanner{
-		st:                   st,
-		gatewaySQLInstanceID: base.SQLInstanceID(tsp.nodes[gatewayNode-1].NodeID),
-		stopper:              stopper,
-		spanResolver:         tsp,
-		gossip:               gw,
-		nodeHealth: distSQLNodeHealth{
-			gossip: gw,
-			connHealthSystem: func(node roachpb.NodeID, _ rpcbase.ConnectionClass) error {
-				_, _, err := mockGossip.GetNodeIDAddress(node)
-				return err
-			},
-			isAvailable: func(base.SQLInstanceID) bool {
-				return true
-			},
-		},
-		codec: codec,
-	}
-
-	ctx := context.Background()
-	// This test is specific to gossip-based planning.
-	useGossipPlanning.Override(ctx, &st.SV, true)
-	planCtx := dsp.NewPlanningCtx(
-		ctx, &extendedEvalContext{Context: eval.Context{Codec: codec, Settings: st}},
-		nil /* planner */, nil /* txn */, FullDistribution,
-	)
-	partitions, err := dsp.PartitionSpans(ctx, planCtx, roachpb.Spans{span}, PartitionSpansBoundDefault)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	resMap := make(map[base.SQLInstanceID][][2]string)
-	for _, p := range partitions {
-		if _, ok := resMap[p.SQLInstanceID]; ok {
-			t.Fatalf("node %d shows up in multiple partitions", p.SQLInstanceID)
-		}
-		var spans [][2]string
-		for _, s := range p.Spans {
-			spans = append(spans, [2]string{string(s.Key), string(s.EndKey)})
-		}
-		resMap[p.SQLInstanceID] = spans
-	}
-
-	expectedPartitions :=
-		map[base.SQLInstanceID][][2]string{
-			2: {{"A", "Z"}},
-		}
-	if !reflect.DeepEqual(resMap, expectedPartitions) {
-		t.Errorf("expected partitions:\n %v\ngot:\n %v", expectedPartitions, resMap)
-	}
-}
-
 func TestCheckNodeHealth(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	defer log.Scope(t).Close(t)

pkg/sql/distsql_plan_bulk.go

Lines changed: 2 additions & 0 deletions
@@ -41,6 +41,8 @@ func (dsp *DistSQLPlanner) SetupAllNodesPlanningWithOracle(
 	localityFilter roachpb.Locality,
 ) (*PlanningCtx, []base.SQLInstanceID, error) {
 	if dsp.codec.ForSystemTenant() {
+		// TODO(yuzefovich): evaluate whether we can remove system tenant
+		// specific code.
 		return dsp.setupAllNodesPlanningSystem(ctx, evalCtx, execCfg, oracle, localityFilter)
 	}
 	return dsp.setupAllNodesPlanningTenant(ctx, evalCtx, execCfg, oracle, localityFilter)
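The TODO above refers to the remaining system-tenant branch in all-nodes planning. A toy sketch of that dispatch, using placeholder types and names rather than the real DistSQLPlanner signatures; only the branching shape matches the code in the diff:

```go
// Toy illustration of the system-tenant vs. secondary-tenant dispatch that
// SetupAllNodesPlanningWithOracle performs.
package main

import "fmt"

type codec struct{ systemTenant bool }

func (c codec) ForSystemTenant() bool { return c.systemTenant }

// planAllNodes picks the planning path based on the tenant kind.
func planAllNodes(c codec) string {
	if c.ForSystemTenant() {
		// TODO in the commit: evaluate whether this branch can be removed.
		return "setupAllNodesPlanningSystem"
	}
	return "setupAllNodesPlanningTenant"
}

func main() {
	fmt.Println(planAllNodes(codec{systemTenant: true}))  // setupAllNodesPlanningSystem
	fmt.Println(planAllNodes(codec{systemTenant: false})) // setupAllNodesPlanningTenant
}
```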
