Skip to content

Commit 30e254e

Browse files
authored
Merge pull request #38 from monobilisim/feat/mysql-galera-monitoring
feat(mysqlHealth): add Galera Cluster Receive Queue and Flow Control monitoring
2 parents c47685e + a1050aa commit 30e254e

File tree

6 files changed

+144
-5
lines changed

6 files changed

+144
-5
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,9 @@ These core tools will be available in every monokit installation.
153153

154154
- mysqlHealth
155155
- Checks MySQL health, including read and write operations.
156+
- Supports Galera Cluster monitoring (Receive Queue and Flow Control).
156157
- Sends alarm notifications to a Slack webhook.
158+
- Opens Redmine issues when the Galera Flow Control pause ratio exceeds its limit.
157159
- Config: `/etc/mono/db.yaml`
158160

159161
- pgsqlHealth

common/db/main.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@ type Mysql struct {
44
Process_limit int
55
Pmm_enabled *bool `json:"pmm_enabled,omitempty"`
66
Cluster struct {
7-
Enabled bool
8-
Size int
9-
Check_table_day string
10-
Check_table_hour string
7+
Enabled bool
8+
Size int
9+
Check_table_day string
10+
Check_table_hour string
11+
Receive_queue_limit int
12+
Flow_control_limit float64
1113
}
1214
Alarm struct {
1315
Enabled bool

config/db.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,12 @@ postgres:
99

1010
mysql:
1111
process_limit: 50
12-
cluster:
12+
cluster:
1313
enabled: false
1414
size: 3
1515
check_table_day: "Sun"
1616
check_table_hour: "05:00"
17+
receive_queue_limit: 10
18+
flow_control_limit: 0.20
1719
alarm:
1820
enabled: true

mysqlHealth/main.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,12 @@ func Main(cmd *cobra.Command, args []string) {
150150

151151
// Check if cluster is synced
152152
CheckClusterSynced()
153+
154+
// Check Galera receive queue
155+
CheckReceiveQueue()
156+
157+
// Check Galera flow control
158+
CheckFlowControl()
153159
}
154160

155161
// check if time matches to configured time

mysqlHealth/mysql.go

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@ import (
88
"fmt"
99
"os"
1010
"os/exec"
11+
"strconv"
1112
"strings"
1213
"time"
1314

1415
"github.com/go-ini/ini"
1516
_ "github.com/go-sql-driver/mysql" // Keep anonymous import for side effects
1617
_mysql "github.com/go-sql-driver/mysql" // Import with alias
1718
"github.com/monobilisim/monokit/common"
19+
issues "github.com/monobilisim/monokit/common/redmine/issues"
1820
"github.com/rs/zerolog/log"
1921
)
2022

@@ -554,6 +556,87 @@ func CheckCertificationWaiting() {
554556
}
555557
}
556558

559+
func CheckReceiveQueue() {
560+
limit := 10
561+
if DbHealthConfig.Mysql.Cluster.Receive_queue_limit != 0 {
562+
limit = DbHealthConfig.Mysql.Cluster.Receive_queue_limit
563+
}
564+
565+
query := "SHOW GLOBAL STATUS WHERE Variable_name = 'wsrep_local_recv_queue'"
566+
rows, err := Connection.Query(query)
567+
if err != nil {
568+
log.Error().Err(err).Msg("CheckReceiveQueue query failed")
569+
return
570+
}
571+
defer rows.Close()
572+
573+
var name string
574+
var value string
575+
if rows.Next() {
576+
if err := rows.Scan(&name, &value); err != nil {
577+
log.Error().Err(err).Msg("Error scanning Receive Queue count")
578+
return
579+
}
580+
}
581+
582+
count, _ := strconv.Atoi(value)
583+
584+
// Update health data
585+
healthData.ClusterInfo.ReceiveQueue.Count = count
586+
healthData.ClusterInfo.ReceiveQueue.Limit = limit
587+
healthData.ClusterInfo.ReceiveQueue.Exceeded = count > limit
588+
589+
if count > limit {
590+
msg := fmt.Sprintf("Galera Receive Queue değeri %d (Limit: %d)", count, limit)
591+
// Only send alarm (webhook), no Redmine issue for Receive Queue as requested
592+
common.AlarmCheckDown("receive queue", msg, false, "", "")
593+
} else {
594+
common.AlarmCheckUp("receive queue", fmt.Sprintf("Receive Queue OK: %d/%d", count, limit), false)
595+
}
596+
}
597+
598+
func CheckFlowControl() {
599+
threshold := 0.2
600+
if DbHealthConfig.Mysql.Cluster.Flow_control_limit != 0 {
601+
threshold = DbHealthConfig.Mysql.Cluster.Flow_control_limit
602+
}
603+
604+
query := "SHOW GLOBAL STATUS WHERE Variable_name = 'wsrep_flow_control_paused'"
605+
rows, err := Connection.Query(query)
606+
if err != nil {
607+
log.Error().Err(err).Msg("CheckFlowControl query failed")
608+
return
609+
}
610+
defer rows.Close()
611+
612+
var name string
613+
var value string
614+
if rows.Next() {
615+
if err := rows.Scan(&name, &value); err != nil {
616+
log.Error().Err(err).Msg("Error scanning Flow Control Paused value")
617+
return
618+
}
619+
}
620+
621+
paused, _ := strconv.ParseFloat(value, 64)
622+
623+
// Update health data
624+
healthData.ClusterInfo.FlowControlPaused = paused
625+
healthData.ClusterInfo.FlowControlLimit = threshold
626+
627+
if paused > threshold {
628+
msg := fmt.Sprintf("Galera Flow Control duraklama oranı %.4f (Limit: %.2f)", paused, threshold)
629+
subject := fmt.Sprintf("%s için Galera Flow Control duraklama oranı %.2f üstüne çıktı", common.Config.Identifier, threshold)
630+
631+
common.AlarmCheckDown("flow control", msg, false, "", "")
632+
issues.CheckDown("flow-control", subject, msg, false, 0)
633+
} else {
634+
msg := fmt.Sprintf("%s için Galera Flow Control duraklama oranı %.2f altına düştü", common.Config.Identifier, threshold)
635+
common.AlarmCheckUp("flow control", "Flow Control OK", false)
636+
issues.CheckUp("flow-control", msg)
637+
}
638+
}
639+
557640
func checkPMM() {
558641
// Check if PMM monitoring is enabled in config (default: enabled)
559642
if DbHealthConfig.Mysql.Pmm_enabled != nil && !*DbHealthConfig.Mysql.Pmm_enabled {

mysqlHealth/ui.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,13 @@ type CertificationWaitingInfo struct {
4040
Exceeded bool
4141
}
4242

43+
// ReceiveQueueInfo holds the outcome of the Galera receive queue check.
type ReceiveQueueInfo struct {
	// Count is the measured receive queue length; Limit is the threshold it
	// is compared against.
	Count, Limit int
	// Exceeded reports whether Count was above Limit.
	Exceeded bool
}
49+
4350
// ClusterInfo contains information about MySQL cluster status
4451
type ClusterInfo struct {
4552
Enabled bool
@@ -48,6 +55,9 @@ type ClusterInfo struct {
4855
Status string
4956
Nodes []NodeInfo
5057
Synced bool
58+
ReceiveQueue ReceiveQueueInfo
59+
FlowControlPaused float64
60+
FlowControlLimit float64
5161
}
5262

5363
// NodeInfo represents a MySQL cluster node
@@ -69,6 +79,10 @@ func NewMySQLHealthData() *MySQLHealthData {
6979
return &MySQLHealthData{
7080
ClusterInfo: ClusterInfo{
7181
Nodes: []NodeInfo{},
82+
ReceiveQueue: ReceiveQueueInfo{
83+
Limit: 10,
84+
},
85+
FlowControlLimit: 0.2,
7286
},
7387
}
7488
}
@@ -162,6 +176,36 @@ func (m *MySQLHealthData) RenderCompact() string {
162176
"Cluster Nodes",
163177
nodesAccessStatus,
164178
isInaccessibleOK))
179+
sb.WriteString("\n")
180+
181+
// Receive Queue status
182+
isQueueOK := !m.ClusterInfo.ReceiveQueue.Exceeded
183+
queueStatus := "within limit"
184+
if !isQueueOK {
185+
queueStatus = "exceeds limit"
186+
}
187+
188+
sb.WriteString(common.StatusListItem(
189+
"Receive Queue",
190+
queueStatus,
191+
fmt.Sprintf("%d", m.ClusterInfo.ReceiveQueue.Limit),
192+
fmt.Sprintf("%d", m.ClusterInfo.ReceiveQueue.Count),
193+
isQueueOK))
194+
sb.WriteString("\n")
195+
196+
// Flow Control status
197+
isFlowOK := m.ClusterInfo.FlowControlPaused <= m.ClusterInfo.FlowControlLimit
198+
flowStatus := "within limit"
199+
if !isFlowOK {
200+
flowStatus = "exceeds limit"
201+
}
202+
203+
sb.WriteString(common.StatusListItem(
204+
"Flow Control",
205+
flowStatus,
206+
fmt.Sprintf("%.2f", m.ClusterInfo.FlowControlLimit),
207+
fmt.Sprintf("%.2f", m.ClusterInfo.FlowControlPaused),
208+
isFlowOK))
165209

166210
// Add cluster size display with custom color logic
167211
clusterSize := m.ClusterInfo.ClusterSize

0 commit comments

Comments
 (0)