Skip to content

Commit 7055b85

Browse files
FEATURE (metrics): Add metrics for RAM & IO (first implementation)
1 parent 0abc222 commit 7055b85

38 files changed

+2205
-729
lines changed

backend/cmd/main.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"postgresus-backend/internal/features/disk"
2121
healthcheck_attempt "postgresus-backend/internal/features/healthcheck/attempt"
2222
healthcheck_config "postgresus-backend/internal/features/healthcheck/config"
23+
postgres_monitoring_collectors "postgresus-backend/internal/features/monitoring/postgres/collectors"
2324
postgres_monitoring_metrics "postgresus-backend/internal/features/monitoring/postgres/metrics"
2425
postgres_monitoring_settings "postgresus-backend/internal/features/monitoring/postgres/settings"
2526
"postgresus-backend/internal/features/notifiers"
@@ -180,7 +181,6 @@ func setUpRoutes(r *gin.Engine) {
180181
}
181182

182183
func setUpDependencies() {
183-
backups.SetupDependencies()
184184
backups.SetupDependencies()
185185
restores.SetupDependencies()
186186
healthcheck_config.SetupDependencies()
@@ -204,12 +204,16 @@ func runBackgroundTasks(log *slog.Logger) {
204204
})
205205

206206
go runWithPanicLogging(log, "healthcheck attempt background service", func() {
207-
healthcheck_attempt.GetHealthcheckAttemptBackgroundService().RunBackgroundTasks()
207+
healthcheck_attempt.GetHealthcheckAttemptBackgroundService().Run()
208208
})
209209

210210
go runWithPanicLogging(log, "postgres monitoring metrics background service", func() {
211211
postgres_monitoring_metrics.GetPostgresMonitoringMetricsBackgroundService().Run()
212212
})
213+
214+
go runWithPanicLogging(log, "postgres monitoring collectors background service", func() {
215+
postgres_monitoring_collectors.GetDbMonitoringBackgroundService().Run()
216+
})
213217
}
214218

215219
func runWithPanicLogging(log *slog.Logger, serviceName string, fn func()) {

backend/internal/features/healthcheck/attempt/background_service.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ type HealthcheckAttemptBackgroundService struct {
1313
logger *slog.Logger
1414
}
1515

16-
func (s *HealthcheckAttemptBackgroundService) RunBackgroundTasks() {
16+
func (s *HealthcheckAttemptBackgroundService) Run() {
1717
// first healthcheck immediately
1818
s.checkDatabases()
1919

Lines changed: 290 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,292 @@
11
package postgres_monitoring_collectors
22

3-
type DbMonitoringBackgroundService struct{}
3+
import (
4+
"context"
5+
"fmt"
6+
"log/slog"
7+
"postgresus-backend/internal/config"
8+
"postgresus-backend/internal/features/databases"
9+
"postgresus-backend/internal/features/databases/databases/postgresql"
10+
postgres_monitoring_metrics "postgresus-backend/internal/features/monitoring/postgres/metrics"
11+
postgres_monitoring_settings "postgresus-backend/internal/features/monitoring/postgres/settings"
12+
"sync"
13+
"sync/atomic"
14+
"time"
15+
16+
"github.com/google/uuid"
17+
"github.com/jackc/pgx/v5"
18+
)
19+
20+
type DbMonitoringBackgroundService struct {
21+
databaseService *databases.DatabaseService
22+
monitoringSettingsService *postgres_monitoring_settings.PostgresMonitoringSettingsService
23+
metricsService *postgres_monitoring_metrics.PostgresMonitoringMetricService
24+
logger *slog.Logger
25+
isRunning int32
26+
lastRunTimes map[uuid.UUID]time.Time
27+
lastRunTimesMutex sync.RWMutex
28+
}
29+
30+
func (s *DbMonitoringBackgroundService) Run() {
31+
for {
32+
if config.IsShouldShutdown() {
33+
s.logger.Info("stopping background monitoring tasks")
34+
return
35+
}
36+
37+
s.processMonitoringTasks()
38+
time.Sleep(1 * time.Second)
39+
}
40+
}
41+
42+
func (s *DbMonitoringBackgroundService) processMonitoringTasks() {
43+
if !atomic.CompareAndSwapInt32(&s.isRunning, 0, 1) {
44+
s.logger.Warn("skipping background task execution, previous task still running")
45+
return
46+
}
47+
defer atomic.StoreInt32(&s.isRunning, 0)
48+
49+
dbsWithEnabledDbMonitoring, err := s.monitoringSettingsService.GetAllDbsWithEnabledDbMonitoring()
50+
if err != nil {
51+
s.logger.Error("failed to get all databases with enabled db monitoring", "error", err)
52+
return
53+
}
54+
55+
for _, dbSettings := range dbsWithEnabledDbMonitoring {
56+
s.processDatabase(&dbSettings)
57+
}
58+
}
59+
60+
func (s *DbMonitoringBackgroundService) processDatabase(
61+
settings *postgres_monitoring_settings.PostgresMonitoringSettings,
62+
) {
63+
db, err := s.databaseService.GetDatabaseByID(settings.DatabaseID)
64+
if err != nil {
65+
s.logger.Error("failed to get database by id", "error", err)
66+
return
67+
}
68+
69+
if db.Type != databases.DatabaseTypePostgres {
70+
return
71+
}
72+
73+
if !s.isReadyForNextRun(settings) {
74+
return
75+
}
76+
77+
err = s.collectAndSaveMetrics(db, settings)
78+
if err != nil {
79+
s.logger.Error("failed to collect and save metrics", "error", err)
80+
return
81+
}
82+
83+
s.updateLastRunTime(db)
84+
}
85+
86+
func (s *DbMonitoringBackgroundService) collectAndSaveMetrics(
87+
db *databases.Database,
88+
settings *postgres_monitoring_settings.PostgresMonitoringSettings,
89+
) error {
90+
if db.Postgresql == nil {
91+
return nil
92+
}
93+
94+
s.logger.Debug("collecting metrics for database", "database_id", db.ID)
95+
96+
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
97+
defer cancel()
98+
99+
conn, err := s.connectToDatabase(ctx, db)
100+
if err != nil {
101+
return fmt.Errorf("failed to connect to database: %w", err)
102+
}
103+
104+
if conn == nil {
105+
return nil
106+
}
107+
108+
defer func() {
109+
if closeErr := conn.Close(ctx); closeErr != nil {
110+
s.logger.Error("Failed to close connection", "error", closeErr)
111+
}
112+
}()
113+
114+
var metrics []postgres_monitoring_metrics.PostgresMonitoringMetric
115+
now := time.Now().UTC()
116+
117+
if settings.IsDbResourcesMonitoringEnabled {
118+
dbMetrics, err := s.collectDatabaseResourceMetrics(ctx, conn, db.ID, now)
119+
if err != nil {
120+
s.logger.Error("failed to collect database resource metrics", "error", err)
121+
} else {
122+
metrics = append(metrics, dbMetrics...)
123+
}
124+
}
125+
126+
if len(metrics) > 0 {
127+
if err := s.metricsService.Insert(metrics); err != nil {
128+
return fmt.Errorf("failed to insert metrics: %w", err)
129+
}
130+
s.logger.Debug(
131+
"successfully collected and saved metrics",
132+
"count",
133+
len(metrics),
134+
"database_id",
135+
db.ID,
136+
)
137+
}
138+
139+
return nil
140+
}
141+
142+
func (s *DbMonitoringBackgroundService) isReadyForNextRun(
143+
settings *postgres_monitoring_settings.PostgresMonitoringSettings,
144+
) bool {
145+
s.lastRunTimesMutex.RLock()
146+
defer s.lastRunTimesMutex.RUnlock()
147+
148+
if s.lastRunTimes == nil {
149+
return true
150+
}
151+
152+
lastRun, exists := s.lastRunTimes[settings.DatabaseID]
153+
if !exists {
154+
return true
155+
}
156+
157+
return time.Since(lastRun) >= time.Duration(settings.MonitoringIntervalSeconds)*time.Second
158+
}
159+
160+
func (s *DbMonitoringBackgroundService) updateLastRunTime(db *databases.Database) {
161+
s.lastRunTimesMutex.Lock()
162+
defer s.lastRunTimesMutex.Unlock()
163+
164+
if s.lastRunTimes == nil {
165+
s.lastRunTimes = make(map[uuid.UUID]time.Time)
166+
}
167+
s.lastRunTimes[db.ID] = time.Now().UTC()
168+
}
169+
170+
func (s *DbMonitoringBackgroundService) connectToDatabase(
171+
ctx context.Context,
172+
db *databases.Database,
173+
) (*pgx.Conn, error) {
174+
if db.Postgresql == nil {
175+
return nil, nil
176+
}
177+
178+
if db.Postgresql.Database == nil || *db.Postgresql.Database == "" {
179+
return nil, nil
180+
}
181+
182+
connStr := s.buildConnectionString(db.Postgresql)
183+
return pgx.Connect(ctx, connStr)
184+
}
185+
186+
func (s *DbMonitoringBackgroundService) buildConnectionString(
187+
pg *postgresql.PostgresqlDatabase,
188+
) string {
189+
sslMode := "disable"
190+
if pg.IsHttps {
191+
sslMode = "require"
192+
}
193+
194+
return fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=%s",
195+
pg.Host,
196+
pg.Port,
197+
pg.Username,
198+
pg.Password,
199+
*pg.Database,
200+
sslMode,
201+
)
202+
}
203+
204+
func (s *DbMonitoringBackgroundService) collectDatabaseResourceMetrics(
205+
ctx context.Context,
206+
conn *pgx.Conn,
207+
databaseID uuid.UUID,
208+
timestamp time.Time,
209+
) ([]postgres_monitoring_metrics.PostgresMonitoringMetric, error) {
210+
var metrics []postgres_monitoring_metrics.PostgresMonitoringMetric
211+
212+
// Collect I/O statistics
213+
ioMetrics, err := s.collectIOMetrics(ctx, conn, databaseID, timestamp)
214+
if err != nil {
215+
s.logger.Warn("failed to collect I/O metrics", "error", err)
216+
} else {
217+
metrics = append(metrics, ioMetrics...)
218+
}
219+
220+
// Collect memory usage (approximation based on buffer usage)
221+
ramMetric, err := s.collectRAMUsageMetric(ctx, conn, databaseID, timestamp)
222+
if err != nil {
223+
s.logger.Warn("failed to collect RAM usage metric", "error", err)
224+
} else {
225+
metrics = append(metrics, ramMetric)
226+
}
227+
228+
return metrics, nil
229+
}
230+
231+
func (s *DbMonitoringBackgroundService) collectIOMetrics(
232+
ctx context.Context,
233+
conn *pgx.Conn,
234+
databaseID uuid.UUID,
235+
timestamp time.Time,
236+
) ([]postgres_monitoring_metrics.PostgresMonitoringMetric, error) {
237+
var blocksRead, blocksHit int64
238+
query := `
239+
SELECT
240+
COALESCE(SUM(blks_read), 0) as total_reads,
241+
COALESCE(SUM(blks_hit), 0) as total_hits
242+
FROM pg_stat_database
243+
WHERE datname = current_database()
244+
`
245+
246+
err := conn.QueryRow(ctx, query).Scan(&blocksRead, &blocksHit)
247+
if err != nil {
248+
return nil, err
249+
}
250+
251+
// Calculate I/O activity as total blocks accessed (PostgreSQL block size is typically 8KB)
252+
const pgBlockSize = 8192 // 8KB
253+
totalIOBytes := float64((blocksRead + blocksHit) * pgBlockSize)
254+
255+
return []postgres_monitoring_metrics.PostgresMonitoringMetric{
256+
{
257+
DatabaseID: databaseID,
258+
Metric: postgres_monitoring_metrics.MetricsTypeDbIO,
259+
ValueType: postgres_monitoring_metrics.MetricsValueTypeByte,
260+
Value: totalIOBytes,
261+
CreatedAt: timestamp,
262+
},
263+
}, nil
264+
}
265+
266+
func (s *DbMonitoringBackgroundService) collectRAMUsageMetric(
267+
ctx context.Context,
268+
conn *pgx.Conn,
269+
databaseID uuid.UUID,
270+
timestamp time.Time,
271+
) (postgres_monitoring_metrics.PostgresMonitoringMetric, error) {
272+
var sharedBuffers int64
273+
query := `
274+
SELECT
275+
COALESCE(SUM(blks_hit), 0) * 8192 as buffer_usage
276+
FROM pg_stat_database
277+
WHERE datname = current_database()
278+
`
279+
280+
err := conn.QueryRow(ctx, query).Scan(&sharedBuffers)
281+
if err != nil {
282+
return postgres_monitoring_metrics.PostgresMonitoringMetric{}, err
283+
}
284+
285+
return postgres_monitoring_metrics.PostgresMonitoringMetric{
286+
DatabaseID: databaseID,
287+
Metric: postgres_monitoring_metrics.MetricsTypeDbRAM,
288+
ValueType: postgres_monitoring_metrics.MetricsValueTypeByte,
289+
Value: float64(sharedBuffers),
290+
CreatedAt: timestamp,
291+
}, nil
292+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package postgres_monitoring_collectors
2+
3+
import (
4+
"postgresus-backend/internal/features/databases"
5+
postgres_monitoring_metrics "postgresus-backend/internal/features/monitoring/postgres/metrics"
6+
postgres_monitoring_settings "postgresus-backend/internal/features/monitoring/postgres/settings"
7+
"postgresus-backend/internal/util/logger"
8+
"sync"
9+
)
10+
11+
var dbMonitoringBackgroundService = &DbMonitoringBackgroundService{
12+
databases.GetDatabaseService(),
13+
postgres_monitoring_settings.GetPostgresMonitoringSettingsService(),
14+
postgres_monitoring_metrics.GetPostgresMonitoringMetricsService(),
15+
logger.GetLogger(),
16+
0,
17+
nil,
18+
sync.RWMutex{},
19+
}
20+
21+
func GetDbMonitoringBackgroundService() *DbMonitoringBackgroundService {
22+
return dbMonitoringBackgroundService
23+
}

backend/internal/features/monitoring/postgres/collectors/system_monitoring_service.go

Lines changed: 0 additions & 3 deletions
This file was deleted.

backend/internal/features/monitoring/postgres/metrics/enums.go

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,8 @@ package postgres_monitoring_metrics
33
type PostgresMonitoringMetricType string
44

55
const (
6-
// system resources (need extensions)
7-
MetricsTypeSystemCPU PostgresMonitoringMetricType = "SYSTEM_CPU_USAGE"
8-
MetricsTypeSystemRAM PostgresMonitoringMetricType = "SYSTEM_RAM_USAGE"
9-
MetricsTypeSystemROM PostgresMonitoringMetricType = "SYSTEM_ROM_USAGE"
10-
MetricsTypeSystemIO PostgresMonitoringMetricType = "SYSTEM_IO_USAGE"
116
// db resources (don't need extensions)
127
MetricsTypeDbRAM PostgresMonitoringMetricType = "DB_RAM_USAGE"
13-
MetricsTypeDbROM PostgresMonitoringMetricType = "DB_ROM_USAGE"
148
MetricsTypeDbIO PostgresMonitoringMetricType = "DB_IO_USAGE"
159
)
1610

0 commit comments

Comments
 (0)