Skip to content

Commit a48d8b2

Browse files
Merge pull request #3 from kamil-holubicki/DISTMYSQL-141
DISTMYSQL-141: cluster-osc-slaves API may not return all required slaves
2 parents e197dd8 + c97e0f8 commit a48d8b2

File tree

7 files changed

+350
-49
lines changed

7 files changed

+350
-49
lines changed

go/inst/instance_dao.go

Lines changed: 85 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,30 @@ func (this InstancesByCountReplicas) Less(i, j int) bool {
6969
return len(this[i].Replicas) < len(this[j].Replicas)
7070
}
7171

72+
// InstancesByDc is a sortable type for Instance
73+
// 1. Instances are sorted by DC
74+
// 2. Within DC group instances are sorted by replicas count
75+
// 3. Within ReplicasCount group insances are:
76+
// a) not sorted if ReplicasCount > 0
77+
// b) sorted by replication lag if ReplicasCount == 0
78+
//
79+
// DC1 < DC2
80+
// if DC1 == DC2 => len(Replicas1) < len (Replicas2)
81+
// if Replicas.cnt == 0 => replicationLag1 < replicatonLag2
82+
type InstancesByDc [](*Instance)
83+
84+
func (this InstancesByDc) Len() int { return len(this) }
85+
func (this InstancesByDc) Swap(i, j int) { this[i], this[j] = this[j], this[i] }
86+
func (this InstancesByDc) Less(i, j int) bool {
87+
if this[i].DataCenter == this[j].DataCenter {
88+
if len(this[i].Replicas) == 0 && len(this[j].Replicas) == 0 {
89+
return this[i].ReplicationLagSeconds.Int64 < this[j].ReplicationLagSeconds.Int64
90+
}
91+
return len(this[i].Replicas) < len(this[j].Replicas)
92+
}
93+
return (this[i].DataCenter < this[j].DataCenter)
94+
}
95+
7296
// Constant strings for Group Replication information
7397
// See https://dev.mysql.com/doc/refman/8.0/en/replication-group-members-table.html for additional information.
7498
const (
@@ -1718,62 +1742,91 @@ func filterOSCInstances(instances [](*Instance)) [](*Instance) {
17181742
return result
17191743
}
17201744

1745+
// Get two busiest instances per DC
1746+
func getTwoBusiestPerDC(all [](*Instance)) [](*Instance) {
1747+
result := [](*Instance){}
1748+
1749+
// sort by DC and replicas count
1750+
sort.Sort(sort.Reverse(InstancesByDc(all)))
1751+
1752+
currentDCInstances := 0
1753+
var currentDC *string = nil
1754+
1755+
for _, im := range all {
1756+
if currentDC == nil || *currentDC != im.DataCenter {
1757+
currentDCInstances = 0
1758+
currentDC = &im.DataCenter
1759+
}
1760+
if currentDCInstances > 1 {
1761+
continue
1762+
}
1763+
currentDCInstances++
1764+
result = append(result, im)
1765+
}
1766+
return result
1767+
}
1768+
17211769
// GetClusterOSCReplicas returns a heuristic list of replicas which are fit as control replicas for an OSC operation.
17221770
// These would be intermediate masters
17231771
func GetClusterOSCReplicas(clusterName string) ([](*Instance), error) {
1724-
intermediateMasters := [](*Instance){}
1725-
result := [](*Instance){}
1726-
var err error
1727-
if strings.Index(clusterName, "'") >= 0 {
1772+
if strings.Contains(clusterName, "'") {
17281773
return [](*Instance){}, log.Errorf("Invalid cluster name: %s", clusterName)
17291774
}
1775+
1776+
result := [](*Instance){}
1777+
// Stage 1: 1st tier servers.
1778+
// We get up to two 1st tier servers from each DC in the following order:
1779+
// 1. Busiest IMs
1780+
// 2. Most lagging leaf nodes
1781+
// Examples:
1782+
// 1. If there are N > 1 IMs in the DC, we will use 2 busiest ones
1783+
// (having the highest number of replicas)
1784+
// 2. If there is only 1 IM in the DC, but there are some leaf nodes,
1785+
// we will use IM + most lagging leaf node
1786+
// 3. If there are no IMs in the DC, but there are leaf nodes, we will use
1787+
// up to two most lagging leaf nodes
1788+
//
1789+
// So this stage will collect at most 2 servers per DC
17301790
{
1731-
// Pick up to two busiest IMs
17321791
condition := `
17331792
replication_depth = 1
1734-
and num_slave_hosts > 0
17351793
and cluster_name = ?
17361794
`
1737-
intermediateMasters, err = readInstancesByCondition(condition, sqlutils.Args(clusterName), "")
1795+
firstTierServers, err := readInstancesByCondition(condition, sqlutils.Args(clusterName), "")
17381796
if err != nil {
17391797
return result, err
17401798
}
1741-
sort.Sort(sort.Reverse(InstancesByCountReplicas(intermediateMasters)))
1742-
intermediateMasters = filterOSCInstances(intermediateMasters)
1743-
intermediateMasters = intermediateMasters[0:math.MinInt(2, len(intermediateMasters))]
1744-
result = append(result, intermediateMasters...)
1799+
1800+
firstTierServers = filterOSCInstances(firstTierServers)
1801+
result = append(result, getTwoBusiestPerDC(firstTierServers)...)
17451802
}
1803+
1804+
// Stage 2: 2nd tier servers
1805+
// Examine all selected 1st tier servers, and if they are IMs, get at most
1806+
// two of their busiest replicas (2nd tier servers).
1807+
// So this stage will collect at most 2 replicas per IM. If we collected 2 IMs
1808+
// per DC in the 1st stage, here we will get 4 servers per DC
17461809
{
1747-
// Get 2 replicas of found IMs, if possible
1748-
if len(intermediateMasters) == 1 {
1749-
// Pick 2 replicas for this IM
1750-
replicas, err := ReadReplicaInstances(&(intermediateMasters[0].Key))
1810+
// Get at most 2 replicas of found IMs
1811+
for _, im := range result {
1812+
if len(im.Replicas) == 0 {
1813+
// this is 1st tier leaf
1814+
continue
1815+
}
1816+
replicas, err := ReadReplicaInstances(&im.Key)
17511817
if err != nil {
17521818
return result, err
17531819
}
17541820
sort.Sort(sort.Reverse(InstancesByCountReplicas(replicas)))
17551821
replicas = filterOSCInstances(replicas)
17561822
replicas = replicas[0:math.MinInt(2, len(replicas))]
17571823
result = append(result, replicas...)
1758-
1759-
}
1760-
if len(intermediateMasters) == 2 {
1761-
// Pick one replica from each IM (should be possible)
1762-
for _, im := range intermediateMasters {
1763-
replicas, err := ReadReplicaInstances(&im.Key)
1764-
if err != nil {
1765-
return result, err
1766-
}
1767-
sort.Sort(sort.Reverse(InstancesByCountReplicas(replicas)))
1768-
replicas = filterOSCInstances(replicas)
1769-
if len(replicas) > 0 {
1770-
result = append(result, replicas[0])
1771-
}
1772-
}
17731824
}
17741825
}
1826+
1827+
// Stage 3: 3rd tier servers
1828+
// Get 2 busiest 3rd tier replicas per DC
17751829
{
1776-
// Get 2 3rd tier replicas, if possible
17771830
condition := `
17781831
replication_depth = 3
17791832
and cluster_name = ?
@@ -1782,25 +1835,8 @@ func GetClusterOSCReplicas(clusterName string) ([](*Instance), error) {
17821835
if err != nil {
17831836
return result, err
17841837
}
1785-
sort.Sort(sort.Reverse(InstancesByCountReplicas(replicas)))
1786-
replicas = filterOSCInstances(replicas)
1787-
replicas = replicas[0:math.MinInt(2, len(replicas))]
1788-
result = append(result, replicas...)
1789-
}
1790-
{
1791-
// Get 2 1st tier leaf replicas, if possible
1792-
condition := `
1793-
replication_depth = 1
1794-
and num_slave_hosts = 0
1795-
and cluster_name = ?
1796-
`
1797-
replicas, err := readInstancesByCondition(condition, sqlutils.Args(clusterName), "")
1798-
if err != nil {
1799-
return result, err
1800-
}
18011838
replicas = filterOSCInstances(replicas)
1802-
replicas = replicas[0:math.MinInt(2, len(replicas))]
1803-
result = append(result, replicas...)
1839+
result = append(result, getTwoBusiestPerDC(replicas)...)
18041840
}
18051841

18061842
return result, nil

0 commit comments

Comments
 (0)