Skip to content

Commit c97e0f8

Browse files
DISTMYSQL-141: cluster-osc-slaves API may not return all required slaves
https://jira.percona.com/browse/DISTMYSQL-141 New algorithm implemented: ---- Stage 1: 1st tier servers. Get up to two 1st tier servers from each DC in the following order: 1. Busiest IMs 2. Most lagging leaf nodes Examples: 1. If there are N > 1 IMs in the DC, we will use the 2 busiest ones (those having the highest number of replicas) 2. If there is only 1 IM in the DC, but there are some leaf nodes, we will use the IM + the most lagging leaf node 3. If there are no IMs in the DC, but there are leaf nodes, we will use up to two most lagging leaf nodes So this stage will collect at most 2 servers per DC ---- Stage 2: 2nd tier servers Examine all collected 1st tier servers (from Stage 1), and if they are IMs, get at most two busiest replicas for each server (2nd tier servers). So this stage will collect at most 2 replicas per IM. If we collected 2 IMs per DC in Stage 1, here we will get 4 servers per DC. ---- Stage 3: 3rd tier servers Get the 2 busiest 3rd tier replicas per DC. If replicas are leaves, we get the replica with the larger lag. So this stage will collect at most 2 servers per DC. Added new tests 'get-heuristic-lag-multi-dc' and 'which-cluster-ocs-replicas'
1 parent e197dd8 commit c97e0f8

File tree

7 files changed

+350
-49
lines changed

7 files changed

+350
-49
lines changed

go/inst/instance_dao.go

Lines changed: 85 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,30 @@ func (this InstancesByCountReplicas) Less(i, j int) bool {
6969
return len(this[i].Replicas) < len(this[j].Replicas)
7070
}
7171

72+
// InstancesByDc is a sortable type for Instance
73+
// 1. Instances are sorted by DC
74+
// 2. Within DC group instances are sorted by replicas count
75+
// 3. Within ReplicasCount group insances are:
76+
// a) not sorted if ReplicasCount > 0
77+
// b) sorted by replication lag if ReplicasCount == 0
78+
//
79+
// DC1 < DC2
80+
// if DC1 == DC2 => len(Replicas1) < len (Replicas2)
81+
// if Replicas.cnt == 0 => replicationLag1 < replicatonLag2
82+
type InstancesByDc [](*Instance)
83+
84+
func (this InstancesByDc) Len() int { return len(this) }
85+
func (this InstancesByDc) Swap(i, j int) { this[i], this[j] = this[j], this[i] }
86+
func (this InstancesByDc) Less(i, j int) bool {
87+
if this[i].DataCenter == this[j].DataCenter {
88+
if len(this[i].Replicas) == 0 && len(this[j].Replicas) == 0 {
89+
return this[i].ReplicationLagSeconds.Int64 < this[j].ReplicationLagSeconds.Int64
90+
}
91+
return len(this[i].Replicas) < len(this[j].Replicas)
92+
}
93+
return (this[i].DataCenter < this[j].DataCenter)
94+
}
95+
7296
// Constant strings for Group Replication information
7397
// See https://dev.mysql.com/doc/refman/8.0/en/replication-group-members-table.html for additional information.
7498
const (
@@ -1718,62 +1742,91 @@ func filterOSCInstances(instances [](*Instance)) [](*Instance) {
17181742
return result
17191743
}
17201744

1745+
// Get two busiest instances per DC
1746+
func getTwoBusiestPerDC(all [](*Instance)) [](*Instance) {
1747+
result := [](*Instance){}
1748+
1749+
// sort by DC and replicas count
1750+
sort.Sort(sort.Reverse(InstancesByDc(all)))
1751+
1752+
currentDCInstances := 0
1753+
var currentDC *string = nil
1754+
1755+
for _, im := range all {
1756+
if currentDC == nil || *currentDC != im.DataCenter {
1757+
currentDCInstances = 0
1758+
currentDC = &im.DataCenter
1759+
}
1760+
if currentDCInstances > 1 {
1761+
continue
1762+
}
1763+
currentDCInstances++
1764+
result = append(result, im)
1765+
}
1766+
return result
1767+
}
1768+
17211769
// GetClusterOSCReplicas returns a heuristic list of replicas which are fit as control replicas for an OSC operation.
17221770
// These would be intermediate masters
17231771
func GetClusterOSCReplicas(clusterName string) ([](*Instance), error) {
1724-
intermediateMasters := [](*Instance){}
1725-
result := [](*Instance){}
1726-
var err error
1727-
if strings.Index(clusterName, "'") >= 0 {
1772+
if strings.Contains(clusterName, "'") {
17281773
return [](*Instance){}, log.Errorf("Invalid cluster name: %s", clusterName)
17291774
}
1775+
1776+
result := [](*Instance){}
1777+
// Stage 1: 1st tier servers.
1778+
// We get up to two 1st tier servers from each DC in the following order:
1779+
// 1. Busiest IMs
1780+
// 2. Most lagging leaf nodes
1781+
// Examples:
1782+
// 1. If there are N > 1 IMs in the DC, we will use 2 busiest ones
1783+
// (having the highest number of replicas)
1784+
// 2. If there is only 1 IM in the DC, but there are some leaf nodes,
1785+
// we will use IM + most lagging leaf node
1786+
// 3. If there are no IMs in the DC, but there are leaf nodes, we will use
1787+
// up to two most lagging leaf nodes
1788+
//
1789+
// So this stage will collect at most 2 servers per DC
17301790
{
1731-
// Pick up to two busiest IMs
17321791
condition := `
17331792
replication_depth = 1
1734-
and num_slave_hosts > 0
17351793
and cluster_name = ?
17361794
`
1737-
intermediateMasters, err = readInstancesByCondition(condition, sqlutils.Args(clusterName), "")
1795+
firstTierServers, err := readInstancesByCondition(condition, sqlutils.Args(clusterName), "")
17381796
if err != nil {
17391797
return result, err
17401798
}
1741-
sort.Sort(sort.Reverse(InstancesByCountReplicas(intermediateMasters)))
1742-
intermediateMasters = filterOSCInstances(intermediateMasters)
1743-
intermediateMasters = intermediateMasters[0:math.MinInt(2, len(intermediateMasters))]
1744-
result = append(result, intermediateMasters...)
1799+
1800+
firstTierServers = filterOSCInstances(firstTierServers)
1801+
result = append(result, getTwoBusiestPerDC(firstTierServers)...)
17451802
}
1803+
1804+
// Stage 2: 2nd tier servers
1805+
// Examine all selected 1st tier servers, and if they are IMs, get at most
1806+
// two of their busiest replicas (2nd tier servers).
1807+
// So this stage will collect at most 2 replicas per IM. If we collected 2 IMs
1808+
// per DC in the 1st stage, here we will get 4 servers per DC
17461809
{
1747-
// Get 2 replicas of found IMs, if possible
1748-
if len(intermediateMasters) == 1 {
1749-
// Pick 2 replicas for this IM
1750-
replicas, err := ReadReplicaInstances(&(intermediateMasters[0].Key))
1810+
// Get at most 2 replicas of found IMs
1811+
for _, im := range result {
1812+
if len(im.Replicas) == 0 {
1813+
// this is 1st tier leaf
1814+
continue
1815+
}
1816+
replicas, err := ReadReplicaInstances(&im.Key)
17511817
if err != nil {
17521818
return result, err
17531819
}
17541820
sort.Sort(sort.Reverse(InstancesByCountReplicas(replicas)))
17551821
replicas = filterOSCInstances(replicas)
17561822
replicas = replicas[0:math.MinInt(2, len(replicas))]
17571823
result = append(result, replicas...)
1758-
1759-
}
1760-
if len(intermediateMasters) == 2 {
1761-
// Pick one replica from each IM (should be possible)
1762-
for _, im := range intermediateMasters {
1763-
replicas, err := ReadReplicaInstances(&im.Key)
1764-
if err != nil {
1765-
return result, err
1766-
}
1767-
sort.Sort(sort.Reverse(InstancesByCountReplicas(replicas)))
1768-
replicas = filterOSCInstances(replicas)
1769-
if len(replicas) > 0 {
1770-
result = append(result, replicas[0])
1771-
}
1772-
}
17731824
}
17741825
}
1826+
1827+
// Stage 3: 3rd tier servers
1828+
// Get 2 busiest 3rd tier replicas per DC
17751829
{
1776-
// Get 2 3rd tier replicas, if possible
17771830
condition := `
17781831
replication_depth = 3
17791832
and cluster_name = ?
@@ -1782,25 +1835,8 @@ func GetClusterOSCReplicas(clusterName string) ([](*Instance), error) {
17821835
if err != nil {
17831836
return result, err
17841837
}
1785-
sort.Sort(sort.Reverse(InstancesByCountReplicas(replicas)))
1786-
replicas = filterOSCInstances(replicas)
1787-
replicas = replicas[0:math.MinInt(2, len(replicas))]
1788-
result = append(result, replicas...)
1789-
}
1790-
{
1791-
// Get 2 1st tier leaf replicas, if possible
1792-
condition := `
1793-
replication_depth = 1
1794-
and num_slave_hosts = 0
1795-
and cluster_name = ?
1796-
`
1797-
replicas, err := readInstancesByCondition(condition, sqlutils.Args(clusterName), "")
1798-
if err != nil {
1799-
return result, err
1800-
}
18011838
replicas = filterOSCInstances(replicas)
1802-
replicas = replicas[0:math.MinInt(2, len(replicas))]
1803-
result = append(result, replicas...)
1839+
result = append(result, getTwoBusiestPerDC(replicas)...)
18041840
}
18051841

18061842
return result, nil

0 commit comments

Comments
 (0)