Skip to content

Commit 5e9ffa3

Browse files
committed
모니터링
1 parent c6d1421 commit 5e9ffa3

File tree

3 files changed

+151
-6
lines changed

3 files changed

+151
-6
lines changed

backend/src/battles/adapters/in/battles.gateway.ts

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,10 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
8181
client.data.userId = userId
8282
this.userIdToSocketMap.set(userId, client)
8383
this.metricsService.setActiveSocketConnections(this.userIdToSocketMap.size)
84+
this.metricsService.recordSocketConnectionEvent('connect')
8485
this.logger.log(`[소켓 연결] userId: ${userId}`)
8586
} catch {
87+
this.metricsService.recordSocketConnectionEvent('error')
8688
this.logger.error(`[소켓 연결 실패] - ${client.id}`)
8789
client.disconnect()
8890
}
@@ -107,6 +109,7 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
107109
if (userId) {
108110
this.userIdToSocketMap.delete(userId)
109111
this.metricsService.setActiveSocketConnections(this.userIdToSocketMap.size)
112+
this.metricsService.recordSocketConnectionEvent('disconnect')
110113
this.logger.log(`[소켓 연결 해제] userId: ${userId}`)
111114
} else {
112115
this.logger.log(`[소켓 연결 해제] - ${client.id}`)
@@ -118,6 +121,7 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
118121
@SubscribeMessage('battle:join')
119122
async joinBattle(@MessageBody() battleJoinRequestDto: BattleJoinRequestDto, @ConnectedSocket() client: SocketWithUserId) {
120123
const stopTimer = this.metricsService.startSocketTimer('battle:join')
124+
const stopActionTimer = this.metricsService.startServiceActionTimer('battle:join')
121125
try {
122126
const userId = this.getUserIdFromSocket(client)
123127

@@ -145,8 +149,10 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
145149

146150
client.emit('battle:joined', { ...res })
147151
stopTimer('success')
152+
stopActionTimer('success')
148153
} catch (error) {
149154
stopTimer('error')
155+
stopActionTimer('error')
150156
if (error instanceof Error) {
151157
client.emit('battle:join:error', {
152158
message: error.message,
@@ -157,19 +163,27 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
157163

158164
/**
 * Handles the `battle:leave` socket event.
 *
 * Delegates to `participationUseCase.leave`, clears `battleId` on the socket's
 * data, and broadcasts the use-case result to the battle room as `battle:leaved`.
 *
 * The entire body runs inside the `try` so that every failure path — including
 * a missing payload or a failed user lookup — stops the `battle:leave`
 * service-action timer with status `error` before the error is rethrown.
 *
 * @param dto - Payload carrying the id of the battle to leave.
 * @param client - Socket that sent the event.
 * @throws Rethrows any error raised while resolving the user, leaving, or emitting.
 */
@SubscribeMessage('battle:leave')
async handleLeave(@MessageBody() dto: { battleId: string }, @ConnectedSocket() client: SocketWithUserId) {
  const stopActionTimer = this.metricsService.startServiceActionTimer('battle:leave')
  try {
    // Resolved inside the try (as the other handlers do for the user id) so a
    // throw here is still recorded instead of leaving the timer unstopped.
    // NOTE(review): assumes getUserIdFromSocket throws for unauthenticated sockets — confirm.
    const { battleId } = dto
    const userId = this.getUserIdFromSocket(client)
    const battleRoomId = getBattleRoomId(battleId)

    const result = await this.participationUseCase.leave(userId, battleId)
    client.data.battleId = undefined

    this.server.to(battleRoomId).emit('battle:leaved', result)
    stopActionTimer('success')
  } catch (error) {
    stopActionTimer('error')
    throw error
  }
}
169182

170183
@SubscribeMessage('battle:start')
171184
async handleStart(@MessageBody() dto: BattleStartDto) {
172185
const stopTimer = this.metricsService.startSocketTimer('battle:start')
186+
const stopActionTimer = this.metricsService.startServiceActionTimer('battle:start')
173187
try {
174188
const { battleId } = dto
175189
await this.creationUseCase.start(battleId)
@@ -178,15 +192,18 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
178192

179193
this.server.to(battleRoomId).emit('battle:started')
180194
stopTimer('success')
195+
stopActionTimer('success')
181196
} catch (error) {
182197
stopTimer('error')
198+
stopActionTimer('error')
183199
throw error
184200
}
185201
}
186202

187203
@SubscribeMessage('battle:attack')
188204
async handleAttack(@MessageBody() dto: AttackRequestDto, @ConnectedSocket() client: SocketWithUserId) {
189205
const stopTimer = this.metricsService.startSocketTimer('battle:attack')
206+
const stopActionTimer = this.metricsService.startServiceActionTimer('battle:attack')
190207
try {
191208
const userId = this.getUserIdFromSocket(client)
192209
const battleId: string = dto.battleId
@@ -197,8 +214,10 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
197214

198215
this.server.to(teamRoom).emit('battle:attack:created', attack)
199216
stopTimer('success')
217+
stopActionTimer('success')
200218
} catch (error) {
201219
stopTimer('error')
220+
stopActionTimer('error')
202221
if (error instanceof Error) {
203222
client.emit('battle:attack:error', {
204223
message: error.message,
@@ -210,6 +229,7 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
210229
@SubscribeMessage('battle:defense')
211230
async handleDefense(@MessageBody() dto: DefenseRequestDto, @ConnectedSocket() client: SocketWithUserId) {
212231
const stopTimer = this.metricsService.startSocketTimer('battle:defense')
232+
const stopActionTimer = this.metricsService.startServiceActionTimer('battle:defense')
213233
try {
214234
const userId = this.getUserIdFromSocket(client)
215235
const battleId: string = dto.battleId
@@ -220,8 +240,10 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
220240

221241
this.server.to(teamRoom).emit('battle:defense:created', defense)
222242
stopTimer('success')
243+
stopActionTimer('success')
223244
} catch (error) {
224245
stopTimer('error')
246+
stopActionTimer('error')
225247
if (error instanceof Error) {
226248
client.emit('battle:defense:error', {
227249
message: error.message,
@@ -233,6 +255,7 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
233255
@SubscribeMessage('battle:attack:vote')
234256
async handleAttackVote(@MessageBody() dto: AttackVoteRequestDto, @ConnectedSocket() client: SocketWithUserId) {
235257
const stopTimer = this.metricsService.startSocketTimer('battle:attack:vote')
258+
const stopActionTimer = this.metricsService.startServiceActionTimer('battle:attack:vote')
236259
try {
237260
const userId = this.getUserIdFromSocket(client)
238261
const battleId: string = dto.battleId
@@ -246,8 +269,10 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
246269
this.server.to(teamRoom).emit('battle:attack:voted', update)
247270
})
248271
stopTimer('success')
272+
stopActionTimer('success')
249273
} catch (error) {
250274
stopTimer('error')
275+
stopActionTimer('error')
251276
if (error instanceof Error) {
252277
client.emit('battle:attack:vote:error', {
253278
message: error.message,
@@ -259,6 +284,7 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
259284
@SubscribeMessage('battle:defense:vote')
260285
async handleDefenseVote(@MessageBody() dto: DefenseVoteRequestDto, @ConnectedSocket() client: SocketWithUserId) {
261286
const stopTimer = this.metricsService.startSocketTimer('battle:defense:vote')
287+
const stopActionTimer = this.metricsService.startServiceActionTimer('battle:defense:vote')
262288
try {
263289
const userId = this.getUserIdFromSocket(client)
264290
const battleId: string = dto.battleId
@@ -272,8 +298,10 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
272298
this.server.to(teamRoom).emit('battle:defense:voted', update)
273299
})
274300
stopTimer('success')
301+
stopActionTimer('success')
275302
} catch (error) {
276303
stopTimer('error')
304+
stopActionTimer('error')
277305
if (error instanceof Error) {
278306
client.emit('battle:defense:vote:error', {
279307
message: error.message,
@@ -285,6 +313,7 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
285313
@SubscribeMessage('battle:user:skip')
286314
async handlePhaseSkip(@MessageBody() dto: { skip: boolean; battleId: string }, @ConnectedSocket() client: SocketWithUserId) {
287315
const stopTimer = this.metricsService.startSocketTimer('battle:user:skip')
316+
const stopActionTimer = this.metricsService.startServiceActionTimer('battle:user:skip')
288317
const userId = this.getUserIdFromSocket(client)
289318
try {
290319
const { skip, battleId } = dto
@@ -294,8 +323,10 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
294323
this.server.to(battleRoomId).emit('battle:user:skipped', { totalSkips })
295324

296325
stopTimer('success')
326+
stopActionTimer('success')
297327
} catch (error) {
298328
stopTimer('error')
329+
stopActionTimer('error')
299330
if (error instanceof Error) {
300331
client.emit('battle:user:skip:error', {
301332
message: error.message,
@@ -362,6 +393,7 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
362393
@SubscribeMessage('battle:chat')
363394
async handleChat(@MessageBody() battleChatDto: BattleChatDto, @ConnectedSocket() client: SocketWithUserId) {
364395
const stopTimer = this.metricsService.startSocketTimer('battle:chat')
396+
const stopActionTimer = this.metricsService.startServiceActionTimer('battle:chat')
365397
try {
366398
const userId = this.getUserIdFromSocket(client)
367399

@@ -374,8 +406,10 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
374406
// this.server.to(roomId).emit('battle:chatted', saved)
375407
this.server.to(roomId).except(client.id).emit('battle:chatted', saved)
376408
stopTimer('success')
409+
stopActionTimer('success')
377410
} catch (error) {
378411
stopTimer('error')
412+
stopActionTimer('error')
379413
if (error instanceof Error) {
380414
client.emit('battle:chat:error', { message: error.message })
381415
}
@@ -385,15 +419,18 @@ export class BattlesGateway implements OnGatewayConnection, OnGatewayDisconnect
385419
@SubscribeMessage('battle:team:vote')
386420
async handleTeamVote(@MessageBody() dto: BattleTeamVoteDto, @ConnectedSocket() client: SocketWithUserId) {
387421
const stopTimer = this.metricsService.startSocketTimer('battle:team:vote')
422+
const stopActionTimer = this.metricsService.startServiceActionTimer('battle:team:vote')
388423
try {
389424
const userId = this.getUserIdFromSocket(client)
390425
const battleId: string = dto.battleId
391426
const team: BattleTeam = dto.team
392427

393428
await this.interactionUseCase.switchTeam(battleId, userId, team)
394429
stopTimer('success')
430+
stopActionTimer('success')
395431
} catch (error) {
396432
stopTimer('error')
433+
stopActionTimer('error')
397434
if (error instanceof Error) {
398435
client.emit('battle:team:vote:error', { message: error.message })
399436
}

backend/src/metrics/metrics.service.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import { Injectable } from '@nestjs/common'
22
import { Registry, collectDefaultMetrics, Counter, Histogram, Gauge } from 'prom-client'
33

44
type SocketStatus = 'success' | 'error'
5+
type ServiceActionStatus = 'success' | 'error'
56

67
@Injectable()
78
export class MetricsService {
@@ -12,6 +13,8 @@ export class MetricsService {
1213
private readonly socketEventCount: Counter<'event' | 'status'>
1314
private readonly socketEventErrors: Counter<'event'>
1415
private readonly socketActiveConnections: Gauge<'scope'>
16+
private readonly socketConnectionEvents: Counter<'type'>
17+
private readonly serviceActionDuration: Histogram<'action' | 'status'>
1518

1619
constructor() {
1720
collectDefaultMetrics({ register: this.registry })
@@ -59,6 +62,21 @@ export class MetricsService {
5962
labelNames: ['scope'],
6063
registers: [this.registry],
6164
})
65+
66+
this.socketConnectionEvents = new Counter({
67+
name: 'socket_connection_events_total',
68+
help: 'Socket connection events',
69+
labelNames: ['type'],
70+
registers: [this.registry],
71+
})
72+
73+
this.serviceActionDuration = new Histogram({
74+
name: 'service_action_duration_seconds',
75+
help: 'Service action handling duration in seconds',
76+
labelNames: ['action', 'status'],
77+
buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5],
78+
registers: [this.registry],
79+
})
6280
}
6381

6482
get contentType() {
@@ -95,6 +113,18 @@ export class MetricsService {
95113
this.socketActiveConnections.set({ scope: 'user' }, count)
96114
}
97115

116+
/**
 * Counts one socket connection lifecycle event.
 *
 * @param type - Which lifecycle event occurred; used as the `type` label on
 *   the `socket_connection_events_total` counter.
 */
recordSocketConnectionEvent(type: 'connect' | 'disconnect' | 'error') {
  const eventLabels = { type }
  // Explicit step of 1 — the same amount the counter adds when no value is given.
  this.socketConnectionEvents.inc(eventLabels, 1)
}
119+
120+
/**
 * Starts timing a service action and hands back a function that finishes it.
 *
 * @param action - Name of the action; used as the `action` label.
 * @returns A callback that, given the final status, observes the elapsed
 *   time (in seconds) on the service-action duration histogram. Each call
 *   to the callback records one observation.
 */
startServiceActionTimer(action: string) {
  const startedAt = process.hrtime()

  const finish = (status: ServiceActionStatus) => {
    // Measure first, then record, so label construction is not part of the sample.
    const elapsedSeconds = this.getDurationSeconds(startedAt)
    this.serviceActionDuration.observe({ action, status }, elapsedSeconds)
  }

  return finish
}
127+
98128
private getDurationSeconds(start: [number, number]) {
99129
const diff = process.hrtime(start)
100130
return diff[0] + diff[1] / 1e9

monitoring/grafana/provisioning/dashboards/socket-events.json

Lines changed: 81 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,22 @@
3838
{
3939
"id": 2,
4040
"type": "timeseries",
41-
"title": "p95 Handling Time by event",
41+
"title": "Handling Time p50/p95/p99 by event",
4242
"datasource": "Prometheus",
4343
"gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
4444
"fieldConfig": { "defaults": { "unit": "s" } },
4545
"targets": [
46+
{
47+
"expr": "histogram_quantile(0.5, sum(rate(socket_event_duration_seconds_bucket{event=~\"$event\"}[5m])) by (le, event))",
48+
"legendFormat": "p50 {{event}}"
49+
},
4650
{
4751
"expr": "histogram_quantile(0.95, sum(rate(socket_event_duration_seconds_bucket{event=~\"$event\"}[5m])) by (le, event))",
48-
"legendFormat": "{{event}}"
52+
"legendFormat": "p95 {{event}}"
53+
},
54+
{
55+
"expr": "histogram_quantile(0.99, sum(rate(socket_event_duration_seconds_bucket{event=~\"$event\"}[5m])) by (le, event))",
56+
"legendFormat": "p99 {{event}}"
4957
}
5058
]
5159
},
@@ -77,19 +85,89 @@
7785
}
7886
]
7987
},
88+
{
89+
"id": 8,
90+
"type": "timeseries",
91+
"title": "Service Action p95 by action",
92+
"datasource": "Prometheus",
93+
"gridPos": { "x": 0, "y": 16, "w": 12, "h": 8 },
94+
"fieldConfig": { "defaults": { "unit": "s" } },
95+
"targets": [
96+
{
97+
"expr": "histogram_quantile(0.95, sum(rate(service_action_duration_seconds_bucket[5m])) by (le, action))",
98+
"legendFormat": "{{action}}"
99+
}
100+
]
101+
},
102+
{
103+
"id": 10,
104+
"type": "timeseries",
105+
"title": "Service Action Average by action",
106+
"datasource": "Prometheus",
107+
"gridPos": { "x": 12, "y": 16, "w": 12, "h": 8 },
108+
"fieldConfig": { "defaults": { "unit": "s" } },
109+
"targets": [
110+
{
111+
"expr": "sum(rate(service_action_duration_seconds_sum[5m])) by (action) / sum(rate(service_action_duration_seconds_count[5m])) by (action)",
112+
"legendFormat": "{{action}}"
113+
}
114+
]
115+
},
80116
{
81117
"id": 5,
82118
"type": "stat",
83119
"title": "Active Socket Connections",
84120
"datasource": "Prometheus",
85-
"gridPos": { "x": 0, "y": 16, "w": 24, "h": 4 },
121+
"gridPos": { "x": 0, "y": 24, "w": 24, "h": 4 },
86122
"fieldConfig": { "defaults": { "unit": "none", "decimals": 0 } },
87123
"targets": [
88124
{
89125
"expr": "socket_active_connections",
90126
"legendFormat": "connections"
91127
}
92128
]
129+
},
130+
{
131+
"id": 6,
132+
"type": "timeseries",
133+
"title": "Connection Events (rate)",
134+
"datasource": "Prometheus",
135+
"gridPos": { "x": 0, "y": 28, "w": 24, "h": 6 },
136+
"fieldConfig": { "defaults": { "unit": "ops" } },
137+
"targets": [
138+
{
139+
"expr": "sum(rate(socket_connection_events_total[1m])) by (type)",
140+
"legendFormat": "{{type}}"
141+
}
142+
]
143+
},
144+
{
145+
"id": 9,
146+
"type": "timeseries",
147+
"title": "Service Action p99 by action",
148+
"datasource": "Prometheus",
149+
"gridPos": { "x": 0, "y": 34, "w": 12, "h": 6 },
150+
"fieldConfig": { "defaults": { "unit": "s" } },
151+
"targets": [
152+
{
153+
"expr": "histogram_quantile(0.99, sum(rate(service_action_duration_seconds_bucket[5m])) by (le, action))",
154+
"legendFormat": "{{action}}"
155+
}
156+
]
157+
},
158+
{
159+
"id": 11,
160+
"type": "timeseries",
161+
"title": "Service Action Error Rate by action",
162+
"datasource": "Prometheus",
163+
"gridPos": { "x": 12, "y": 34, "w": 12, "h": 6 },
164+
"fieldConfig": { "defaults": { "unit": "percent" } },
165+
"targets": [
166+
{
167+
"expr": "100 * sum(rate(service_action_duration_seconds_count{status=\"error\"}[5m])) by (action) / sum(rate(service_action_duration_seconds_count[5m])) by (action)",
168+
"legendFormat": "{{action}}"
169+
}
170+
]
93171
}
94172
]
95173
}

0 commit comments

Comments
 (0)