Skip to content

Commit bb602aa

Browse files
authored
fix: improve metrics performance (#890)
1 parent 3788c85 commit bb602aa

File tree

8 files changed

+15232
-10569
lines changed

8 files changed

+15232
-10569
lines changed

package-lock.json

Lines changed: 15011 additions & 10469 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 13 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -41,19 +41,19 @@
4141
"@fastify/swagger-ui": "^4.1.0",
4242
"@isaacs/ttlcache": "^1.4.1",
4343
"@kubernetes/client-node": "^1.3.0",
44-
"@opentelemetry/api": "^1.8.0",
45-
"@opentelemetry/auto-instrumentations-node": "^0.67.3",
46-
"@opentelemetry/exporter-metrics-otlp-grpc": "^0.208.0",
47-
"@opentelemetry/exporter-prometheus": "^0.208.0",
48-
"@opentelemetry/host-metrics": "^0.38.2",
49-
"@opentelemetry/instrumentation-aws-sdk": "^0.56.0",
50-
"@opentelemetry/instrumentation-fastify": "^0.50.0",
51-
"@opentelemetry/instrumentation-http": "^0.208.0",
52-
"@opentelemetry/instrumentation-knex": "^0.53.0",
53-
"@opentelemetry/instrumentation-pg": "^0.55.0",
54-
"@opentelemetry/instrumentation-runtime-node": "^0.22.0",
55-
"@opentelemetry/sdk-metrics": "^2.2.0",
56-
"@opentelemetry/sdk-node": "^0.208.0",
44+
"@opentelemetry/api": "^1.9.0",
45+
"@opentelemetry/auto-instrumentations-node": "^0.70.1",
46+
"@opentelemetry/exporter-metrics-otlp-grpc": "^0.213.0",
47+
"@opentelemetry/exporter-prometheus": "^0.213.0",
48+
"@opentelemetry/host-metrics": "^0.38.3",
49+
"@opentelemetry/instrumentation-aws-sdk": "^0.59.0",
50+
"@opentelemetry/instrumentation-fastify": "^0.50.3",
51+
"@opentelemetry/instrumentation-http": "^0.213.0",
52+
"@opentelemetry/instrumentation-knex": "^0.53.1",
53+
"@opentelemetry/instrumentation-pg": "^0.64.0",
54+
"@opentelemetry/instrumentation-runtime-node": "^0.25.0",
55+
"@opentelemetry/sdk-metrics": "^2.6.0",
56+
"@opentelemetry/sdk-node": "^0.213.0",
5757
"@shopify/semaphore": "^3.0.2",
5858
"@smithy/node-http-handler": "^2.3.1",
5959
"@tus/file-store": "2.0.0",

src/admin-app.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@ const build = (opts: FastifyServerOptions = {}): FastifyInstance => {
1616
app.register(routes.migrations, { prefix: 'migrations' })
1717
app.register(routes.s3Credentials, { prefix: 's3' })
1818
app.register(routes.queue, { prefix: 'queue' })
19+
app.register(routes.metricsConfig, { prefix: 'metrics' })
1920

2021
// Register /metrics endpoint - uses OTel Prometheus exporter
2122
if (prometheusMetricsEnabled) {

src/http/plugins/metrics.ts

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -88,19 +88,17 @@ export const httpMetrics = (options: HttpMetricsOptions = {}) =>
8888
const statusCode = groupStatusCodes
8989
? `${Math.floor(reply.statusCode / 100)}xx`
9090
: String(reply.statusCode)
91-
const tenantId = request.tenantId || ''
9291

9392
const attributes = {
9493
method,
9594
route,
9695
operation: request.operation?.type || 'unknown',
9796
status_code: statusCode,
98-
tenantId,
97+
tenantId: request.tenantId || '',
9998
}
10099

101-
// Record metrics
100+
// Record duration (histogram count replaces httpRequestsTotal)
102101
httpRequestDuration.record(durationSeconds, attributes)
103-
httpRequestsTotal.add(1, attributes)
104102

105103
// Record request size from content-length header
106104
const requestContentLength = request.headers['content-length']

src/http/routes/admin/index.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
export { default as jwks } from './jwks'
2+
export { default as metricsConfig } from './metrics'
23
export { default as migrations } from './migrations'
34
export { default as objects } from './objects'
45
export { default as queue } from './queue'

src/http/routes/admin/metrics.ts

Lines changed: 49 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,49 @@
1+
import { getMetricsConfig, setMetricsEnabled } from '@internal/monitoring/metrics'
2+
import { FastifyInstance, RequestGenericInterface } from 'fastify'
3+
import { FromSchema } from 'json-schema-to-ts'
4+
import apiKey from '../../plugins/apikey'
5+
6+
const updateMetricsConfigSchema = {
7+
body: {
8+
type: 'object',
9+
properties: {
10+
metrics: {
11+
type: 'array',
12+
items: {
13+
type: 'object',
14+
properties: {
15+
name: { type: 'string' },
16+
enabled: { type: 'boolean' },
17+
},
18+
required: ['name', 'enabled'],
19+
},
20+
},
21+
},
22+
required: ['metrics'],
23+
},
24+
} as const
25+
26+
interface UpdateMetricsConfigRequest extends RequestGenericInterface {
27+
Body: FromSchema<typeof updateMetricsConfigSchema.body>
28+
}
29+
30+
export default async function routes(fastify: FastifyInstance) {
31+
fastify.register(apiKey)
32+
33+
fastify.get('/config', async (_request, reply) => {
34+
return reply.send({
35+
metrics: getMetricsConfig(),
36+
})
37+
})
38+
39+
fastify.put<UpdateMetricsConfigRequest>(
40+
'/config',
41+
{ schema: updateMetricsConfigSchema },
42+
async (request, reply) => {
43+
setMetricsEnabled(request.body.metrics)
44+
return reply.code(200).send({
45+
metrics: getMetricsConfig(),
46+
})
47+
}
48+
)
49+
}

src/internal/database/pool.ts

Lines changed: 26 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -157,6 +157,7 @@ export class PoolManager {
157157
*/
158158
class TenantPool implements PoolStrategy {
159159
protected pool?: Knex
160+
protected monitorHandle?: ReturnType<typeof setInterval>
160161

161162
constructor(protected readonly options: TenantConnectionOptions) {}
162163

@@ -166,10 +167,12 @@ class TenantPool implements PoolStrategy {
166167
}
167168

168169
this.pool = this.createKnexPool()
170+
this.startMonitor()
169171
return this.pool
170172
}
171173

172174
destroy(): Promise<void> {
175+
this.stopMonitor()
173176
const originalPool = this.pool
174177

175178
if (!originalPool) {
@@ -180,6 +183,27 @@ class TenantPool implements PoolStrategy {
180183
return this.drainPool(originalPool)
181184
}
182185

186+
protected startMonitor() {
187+
this.monitorHandle = setInterval(() => {
188+
const tarnPool = this.pool?.client?.pool
189+
if (!tarnPool) return
190+
191+
dbInUseConnection.record(tarnPool.numUsed(), { tenant_id: this.options.tenantId })
192+
dbActiveConnection.record(tarnPool.numUsed() + tarnPool.numFree(), {
193+
tenant_id: this.options.tenantId,
194+
})
195+
}, 2000)
196+
197+
this.monitorHandle.unref()
198+
}
199+
200+
protected stopMonitor() {
201+
if (this.monitorHandle) {
202+
clearInterval(this.monitorHandle)
203+
this.monitorHandle = undefined
204+
}
205+
}
206+
183207
getSettings() {
184208
const isSingleUseExternalPool = this.options.isSingleUse && this.options.isExternalPool
185209

@@ -208,6 +232,7 @@ class TenantPool implements PoolStrategy {
208232
return
209233
}
210234

235+
this.stopMonitor()
211236
const originalPool = this.pool
212237

213238
this.options.clusterSize = options.clusterSize
@@ -255,9 +280,8 @@ class TenantPool implements PoolStrategy {
255280
})
256281

257282
const maxConnections = settings.maxConnections
258-
const tenantId = this.options.tenantId
259283

260-
const pool = knex({
284+
return knex({
261285
client: 'pg',
262286
version: dbPostgresVersion,
263287
searchPath: settings.searchPath,
@@ -276,50 +300,5 @@ class TenantPool implements PoolStrategy {
276300
},
277301
acquireConnectionTimeout: databaseConnectionTimeout,
278302
})
279-
280-
// Track total connections in pool per tenant
281-
pool.client.pool.on('createSuccess', () => {
282-
dbActiveConnection.add(1, { tenant_id: tenantId })
283-
})
284-
285-
pool.client.pool.on('destroySuccess', () => {
286-
dbActiveConnection.add(-1, { tenant_id: tenantId })
287-
})
288-
289-
// Track in-use connections per tenant
290-
pool.client.pool.on('acquireSuccess', () => {
291-
dbInUseConnection.add(1, { tenant_id: tenantId })
292-
})
293-
294-
pool.client.pool.on('release', () => {
295-
dbInUseConnection.add(-1, { tenant_id: tenantId })
296-
})
297-
298-
// Track connection acquisition time using eventId to correlate requests with completions
299-
const pendingAcquires = new Map<number, number>()
300-
301-
pool.client.pool.on('acquireRequest', (eventId: number) => {
302-
pendingAcquires.set(eventId, performance.now())
303-
})
304-
305-
pool.client.pool.on('acquireSuccess', (eventId: number) => {
306-
const startTime = pendingAcquires.get(eventId)
307-
if (startTime !== undefined) {
308-
pendingAcquires.delete(eventId)
309-
const durationSeconds = (performance.now() - startTime) / 1000
310-
dbConnectionAcquireTime.record(durationSeconds, { tenant_id: tenantId })
311-
}
312-
})
313-
314-
pool.client.pool.on('acquireFail', (eventId: number) => {
315-
const startTime = pendingAcquires.get(eventId)
316-
if (startTime !== undefined) {
317-
pendingAcquires.delete(eventId)
318-
const durationSeconds = (performance.now() - startTime) / 1000
319-
dbConnectionAcquireTime.record(durationSeconds, { tenant_id: tenantId, failed: 'true' })
320-
}
321-
})
322-
323-
return pool
324303
}
325304
}

0 commit comments

Comments (0)