🟡 MEDIUM: Monitoring and Logging - Insufficient Observability and Debugging Capabilities
Problem Description
The NSX application lacks comprehensive monitoring, logging, and observability features, making it difficult to debug issues, track performance, and maintain system health in production.
Current problematic patterns:
// server/lib/Logger.ts - Basic logging only
export default class Logger {
static log() { console.log() }
static info(message: string) { console.log(message) }
static warn(message: string) { console.warn(message) }
static error(message: string) { console.error(message) }
}
// server/routes/post.ts - No structured logging
router.get('/post/:id', async (req: Request, res: Response) => {
try {
const post = await prisma.post.findFirst({
where: { id: parseInt(req.params.id, 10) },
})
res.status(200).json(post)
} catch (error: unknown) {
if (error instanceof Error) {
Logger.error(error) // ❌ No context, no structured data
res.status(500).json({ error: error.message })
}
}
})
Why This Is Problematic
- Debugging Difficulties: No structured logging makes it hard to trace issues
- Performance Blindness: No metrics to identify bottlenecks
- Security Gaps: No monitoring for suspicious activities
- Operational Issues: No alerts for system failures
- User Experience: No visibility into user behavior and errors
Current Impact
- Severity: Medium-High
- Debugging: Poor - difficult to trace issues
- Operations: Limited - no proactive monitoring
- Security: Medium - no threat detection
Comprehensive Solution
1. Implement Structured Logging
// server/lib/logging/structuredLogger.ts
import winston from 'winston'
import { Request, Response } from 'express'
interface LogContext {
requestId?: string
userId?: string
ip?: string
userAgent?: string
method?: string
url?: string
statusCode?: number
duration?: number
error?: unknown
metadata?: Record<string, any>
// Allow call sites to attach extra structured fields (operation, table, category, ...)
[key: string]: unknown
}
class StructuredLogger {
private logger: winston.Logger
constructor() {
this.logger = winston.createLogger({
level: process.env.LOG_LEVEL || 'info',
format: winston.format.combine(
winston.format.timestamp(),
winston.format.errors({ stack: true }),
winston.format.json()
),
defaultMeta: {
service: 'nsx-api',
version: process.env.npm_package_version || '1.0.0',
environment: process.env.NODE_ENV || 'development'
},
transports: [
new winston.transports.Console({
format: winston.format.combine(
winston.format.colorize(),
winston.format.simple()
)
}),
new winston.transports.File({
filename: 'logs/error.log',
level: 'error',
maxsize: 5242880, // 5MB
maxFiles: 5
}),
new winston.transports.File({
filename: 'logs/combined.log',
maxsize: 5242880, // 5MB
maxFiles: 5
})
]
})
}
info(message: string, context?: LogContext) {
this.logger.info(message, context)
}
warn(message: string, context?: LogContext) {
this.logger.warn(message, context)
}
error(message: string, context?: LogContext) {
this.logger.error(message, context)
}
debug(message: string, context?: LogContext) {
this.logger.debug(message, context)
}
// Request logging
logRequest(req: Request, res: Response, duration: number) {
const context: LogContext = {
requestId: req.headers['x-request-id'] as string,
method: req.method,
url: req.url,
statusCode: res.statusCode,
duration,
ip: req.ip,
userAgent: req.get('user-agent')
}
const level = res.statusCode >= 400 ? 'warn' : 'info'
this.logger[level]('HTTP Request', context)
}
// Error logging with full context
logError(error: Error, context?: LogContext) {
this.logger.error('Application Error', {
...context,
error: {
name: error.name,
message: error.message,
stack: error.stack
}
})
}
// Security event logging
logSecurityEvent(event: string, context?: LogContext) {
this.logger.warn(`Security Event: ${event}`, {
...context,
severity: 'HIGH',
category: 'SECURITY'
})
}
// Performance logging
logPerformance(operation: string, duration: number, context?: LogContext) {
this.logger.info(`Performance: ${operation}`, {
...context,
duration,
category: 'PERFORMANCE'
})
}
}
export const logger = new StructuredLogger()
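For comparison with the problematic route shown earlier, here is a minimal sketch of the same handler rewritten against the structured logger (assuming the router, Prisma client, and Express types are imported as in the original snippet):
// server/routes/post.ts - same route, now logging with request context
router.get('/post/:id', async (req: Request, res: Response) => {
  try {
    const post = await prisma.post.findFirst({
      where: { id: parseInt(req.params.id, 10) },
    })
    res.status(200).json(post)
  } catch (error: unknown) {
    if (error instanceof Error) {
      // Structured error entry: request id, route and client details travel with the stack trace
      logger.logError(error, {
        requestId: req.headers['x-request-id'] as string,
        method: req.method,
        url: req.url,
        ip: req.ip,
      })
      res.status(500).json({ error: error.message })
    }
  }
})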
2. Add Application Metrics
// server/lib/monitoring/metrics.ts
import { register, Counter, Histogram, Gauge } from 'prom-client'
// HTTP request metrics
export const httpRequestDuration = new Histogram({
name: 'http_request_duration_seconds',
help: 'Duration of HTTP requests in seconds',
labelNames: ['method', 'route', 'status_code'],
buckets: [0.1, 0.3, 0.5, 0.7, 1, 3, 5, 7, 10]
})
export const httpRequestTotal = new Counter({
name: 'http_requests_total',
help: 'Total number of HTTP requests',
labelNames: ['method', 'route', 'status_code']
})
// Database metrics
export const dbQueryDuration = new Histogram({
name: 'database_query_duration_seconds',
help: 'Duration of database queries in seconds',
labelNames: ['operation', 'table'],
buckets: [0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5]
})
export const dbQueryTotal = new Counter({
name: 'database_queries_total',
help: 'Total number of database queries',
labelNames: ['operation', 'table', 'status']
})
// Business metrics
export const postsCreated = new Counter({
name: 'posts_created_total',
help: 'Total number of posts created',
labelNames: ['user_id']
})
export const usersActive = new Gauge({
name: 'users_active',
help: 'Number of active users',
labelNames: ['time_window']
})
// Error metrics
export const errorsTotal = new Counter({
name: 'errors_total',
help: 'Total number of errors',
labelNames: ['type', 'severity', 'endpoint']
})
// System metrics
export const memoryUsage = new Gauge({
name: 'memory_usage_bytes',
help: 'Memory usage in bytes',
labelNames: ['type']
})
export const cpuUsage = new Gauge({
name: 'cpu_usage_percent',
help: 'CPU usage percentage'
})
// Register all metrics
register.registerMetric(httpRequestDuration)
register.registerMetric(httpRequestTotal)
register.registerMetric(dbQueryDuration)
register.registerMetric(dbQueryTotal)
register.registerMetric(postsCreated)
register.registerMetric(usersActive)
register.registerMetric(errorsTotal)
register.registerMetric(memoryUsage)
register.registerMetric(cpuUsage)
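The memoryUsage and cpuUsage gauges above are declared but nothing in this section updates them. The sketch below is one way to feed them on a timer; the file path, function name, and 15-second interval are assumptions, not part of the existing codebase:
// server/lib/monitoring/systemMetrics.ts (hypothetical helper)
import { memoryUsage, cpuUsage } from './metrics'

let lastCpu = process.cpuUsage()
let lastSample = Date.now()

export function startSystemMetricsCollection(intervalMs = 15000) {
  return setInterval(() => {
    // Memory: one gauge sample per heap figure, distinguished by the 'type' label
    const mem = process.memoryUsage()
    memoryUsage.labels('rss').set(mem.rss)
    memoryUsage.labels('heapTotal').set(mem.heapTotal)
    memoryUsage.labels('heapUsed').set(mem.heapUsed)

    // CPU: user + system time spent since the last sample, as a share of wall-clock time
    const now = Date.now()
    const cpu = process.cpuUsage(lastCpu) // microseconds used since lastCpu
    const elapsedMicros = (now - lastSample) * 1000
    cpuUsage.set(((cpu.user + cpu.system) / elapsedMicros) * 100)

    lastCpu = process.cpuUsage()
    lastSample = now
  }, intervalMs)
}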
3. Create Monitoring Middleware
// server/lib/monitoring/middleware.ts
import { Request, Response, NextFunction } from 'express'
import { httpRequestDuration, httpRequestTotal, errorsTotal } from './metrics'
import { logger } from '../logging/structuredLogger'
// Request timing middleware
export const requestTiming = (req: Request, res: Response, next: NextFunction) => {
const startTime = Date.now()
res.on('finish', () => {
const duration = (Date.now() - startTime) / 1000
const route = req.route?.path || req.path
// Log request
logger.logRequest(req, res, duration)
// Record metrics
httpRequestDuration
.labels(req.method, route, res.statusCode.toString())
.observe(duration)
httpRequestTotal
.labels(req.method, route, res.statusCode.toString())
.inc()
// Record error metrics
if (res.statusCode >= 400) {
errorsTotal
.labels('http', 'error', route)
.inc()
}
})
next()
}
// Error tracking middleware
export const errorTracking = (error: Error, req: Request, res: Response, next: NextFunction) => {
// Log error with context
logger.logError(error, {
requestId: req.headers['x-request-id'] as string,
method: req.method,
url: req.url,
ip: req.ip,
userAgent: req.get('user-agent'),
body: req.body,
params: req.params,
query: req.query
})
// Record error metrics
errorsTotal
.labels('application', 'error', req.route?.path || req.path)
.inc()
next(error)
}
// Database query monitoring
export const dbQueryMonitoring = async <T>(
operation: () => Promise<T>,
operationName: string,
tableName: string
): Promise<T> => {
const startTime = Date.now()
try {
const result = await operation()
const duration = (Date.now() - startTime) / 1000
// Log performance
logger.logPerformance(`DB Query: ${operationName}`, duration, {
operation: operationName,
table: tableName
})
// Record metrics
dbQueryDuration
.labels(operationName, tableName)
.observe(duration)
dbQueryTotal
.labels(operationName, tableName, 'success')
.inc()
return result
} catch (error) {
const duration = (Date.now() - startTime) / 1000
// Log error
logger.logError(error as Error, {
operation: operationName,
table: tableName,
duration
})
// Record error metrics
dbQueryTotal
.labels(operationName, tableName, 'error')
.inc()
throw error
}
}
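How these pieces are wired together depends on the existing server setup. The sketch below assumes a conventional Express bootstrap file (the app.ts path is an assumption) and shows the intended middleware ordering plus one wrapped Prisma call:
// server/app.ts (hypothetical wiring)
import express from 'express'
import { requestTiming, errorTracking, dbQueryMonitoring } from './lib/monitoring/middleware'
import { prisma } from './lib/database'

const app = express()

// Timing first, so every request that reaches a route is measured and logged
app.use(requestTiming)

// ...routes are registered here...

// Error tracking last, so it observes errors thrown by any route handler
app.use(errorTracking)

// Wrapping a query records its duration and success/error outcome automatically
const findPost = (id: number) =>
  dbQueryMonitoring(() => prisma.post.findFirst({ where: { id } }), 'findFirst', 'post')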
4. Add Health Check Endpoints
// server/routes/health.ts
import { Router, Request, Response } from 'express'
import { register } from 'prom-client'
import { prisma } from '../lib/database'
import { logger } from '../lib/logging/structuredLogger'
import { ResponseBuilder } from '../lib/api/responseBuilder'
const router = Router()
// Basic health check
router.get('/health', (req: Request, res: Response) => {
const health = {
status: 'healthy',
timestamp: new Date().toISOString(),
uptime: process.uptime(),
memory: process.memoryUsage(),
version: process.env.npm_package_version || '1.0.0',
environment: process.env.NODE_ENV || 'development'
}
ResponseBuilder.success(res, health, 'Service is healthy')
})
// Detailed health check
router.get('/health/detailed', async (req: Request, res: Response) => {
const checks = {
database: await checkDatabase(),
memory: checkMemory(),
disk: checkDiskSpace(),
external: await checkExternalServices()
}
const allHealthy = Object.values(checks).every(check => check.status === 'healthy')
const status = allHealthy ? 'healthy' : 'unhealthy'
ResponseBuilder.success(res, {
status,
timestamp: new Date().toISOString(),
checks
}, allHealthy ? 'All systems healthy' : 'Some systems unhealthy')
})
// Metrics endpoint
router.get('/metrics', async (req: Request, res: Response) => {
try {
const metrics = await register.metrics()
res.set('Content-Type', register.contentType)
res.send(metrics)
} catch (error) {
logger.error('Failed to generate metrics', { error })
res.status(500).send('Internal Server Error')
}
})
// Helper functions
async function checkDatabase() {
const startTime = Date.now()
try {
await prisma.$queryRaw`SELECT 1`
return { status: 'healthy', responseTime: Date.now() - startTime }
} catch (error) {
logger.error('Database health check failed', { error })
return { status: 'unhealthy', error: error instanceof Error ? error.message : String(error) }
}
}
function checkMemory() {
const usage = process.memoryUsage()
const totalMB = usage.heapTotal / 1024 / 1024
const usedMB = usage.heapUsed / 1024 / 1024
const usagePercent = (usedMB / totalMB) * 100
return {
status: usagePercent > 90 ? 'unhealthy' : 'healthy',
usage: {
total: totalMB,
used: usedMB,
percentage: usagePercent
}
}
}
function checkDiskSpace() {
// This would require a filesystem check
// For now, return healthy
return { status: 'healthy', message: 'Disk space check not implemented' }
}
async function checkExternalServices() {
// Check external dependencies
const services = {
database: await checkDatabase(),
// Add other external services here
}
const allHealthy = Object.values(services).every(service => service.status === 'healthy')
return {
status: allHealthy ? 'healthy' : 'unhealthy',
services
}
}
export default router
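Since the file exports its router, mounting the health routes is a one-liner. A minimal, self-contained sketch (the app.ts path is still an assumption; in practice this goes in the same bootstrap file as the earlier wiring sketch):
// server/app.ts (hypothetical)
import express from 'express'
import healthRouter from './routes/health'

const app = express()

// /health, /health/detailed and /metrics become available at the application root;
// in most deployments /metrics is only reachable from an internal network
app.use(healthRouter)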
5. Add Security Monitoring
// server/lib/monitoring/security.ts
import { Request } from 'express'
import { logger } from '../logging/structuredLogger'
import { errorsTotal } from './metrics'
interface SecurityEvent {
type: 'AUTH_FAILURE' | 'RATE_LIMIT' | 'SUSPICIOUS_ACTIVITY' | 'INVALID_INPUT'
severity: 'LOW' | 'MEDIUM' | 'HIGH' | 'CRITICAL'
details: Record<string, any>
}
export class SecurityMonitor {
private static suspiciousIPs = new Map<string, number>()
private static failedLogins = new Map<string, number>()
static trackAuthFailure(req: Request, username: string) {
const ip = req.ip || 'unknown'
const key = `${ip}:${username}`
// Increment failure count
const failures = this.failedLogins.get(key) || 0
this.failedLogins.set(key, failures + 1)
// Log security event
logger.logSecurityEvent('Authentication Failure', {
requestId: req.headers['x-request-id'] as string,
ip,
username,
userAgent: req.get('user-agent'),
failureCount: failures + 1,
metadata: {
type: 'AUTH_FAILURE',
severity: failures > 5 ? 'HIGH' : 'MEDIUM'
}
})
// Record metrics
errorsTotal
.labels('auth_failure', 'security', req.route?.path || req.path)
.inc()
// Check for suspicious activity
if (failures > 5) {
this.trackSuspiciousActivity(req, 'Multiple failed logins', {
username,
failureCount: failures + 1
})
}
}
static trackSuspiciousActivity(req: Request, reason: string, details: Record<string, any>) {
const ip = req.ip || 'unknown'
// Track suspicious IPs
const count = this.suspiciousIPs.get(ip) || 0
this.suspiciousIPs.set(ip, count + 1)
logger.logSecurityEvent('Suspicious Activity', {
requestId: req.headers['x-request-id'] as string,
ip,
reason,
userAgent: req.get('user-agent'),
metadata: {
type: 'SUSPICIOUS_ACTIVITY',
severity: 'HIGH',
details
}
})
// Record metrics
errorsTotal
.labels('suspicious_activity', 'security', req.route?.path || req.path)
.inc()
}
static trackRateLimit(req: Request, limit: number) {
logger.logSecurityEvent('Rate Limit Exceeded', {
requestId: req.headers['x-request-id'] as string,
ip: req.ip,
userAgent: req.get('user-agent'),
metadata: {
type: 'RATE_LIMIT',
severity: 'MEDIUM',
limit
}
})
}
static trackInvalidInput(req: Request, input: any, validationErrors: any[]) {
logger.logSecurityEvent('Invalid Input', {
requestId: req.headers['x-request-id'] as string,
ip: req.ip,
userAgent: req.get('user-agent'),
metadata: {
type: 'INVALID_INPUT',
severity: 'LOW',
input,
validationErrors
}
})
}
static getSecurityMetrics() {
return {
suspiciousIPs: Object.fromEntries(this.suspiciousIPs),
failedLogins: Object.fromEntries(this.failedLogins)
}
}
static resetCounters() {
this.suspiciousIPs.clear()
this.failedLogins.clear()
}
}
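The monitor only pays off if route handlers call it. The login sketch below is hypothetical (findUserByUsername, verifyPassword and issueToken are placeholder helpers, not existing NSX functions) and only illustrates where trackAuthFailure belongs:
// server/routes/login.ts (hypothetical usage)
router.post('/login', async (req: Request, res: Response) => {
  const { username, password } = req.body
  const user = await findUserByUsername(username) // placeholder lookup
  if (!user || !(await verifyPassword(user, password))) { // placeholder check
    // Record the failure so repeated attempts surface as a security event
    SecurityMonitor.trackAuthFailure(req, username)
    return res.status(401).json({ error: 'Invalid credentials' })
  }
  res.status(200).json({ token: issueToken(user) }) // placeholder token helper
})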
6. Add Performance Monitoring
// server/lib/monitoring/performance.ts
import { performance } from 'perf_hooks'
import { logger } from '../logging/structuredLogger'
import { dbQueryDuration, dbQueryTotal } from './metrics'
export class PerformanceMonitor {
private static activeOperations = new Map<string, number>()
static startOperation(operationId: string): number {
const startTime = performance.now()
this.activeOperations.set(operationId, startTime)
return startTime
}
static endOperation(operationId: string, operationName: string, tableName?: string) {
const startTime = this.activeOperations.get(operationId)
if (!startTime) return
const duration = (performance.now() - startTime) / 1000
this.activeOperations.delete(operationId)
// Log performance
logger.logPerformance(operationName, duration, {
operationId,
table: tableName
})
// Record metrics
if (tableName) {
dbQueryDuration
.labels(operationName, tableName)
.observe(duration)
dbQueryTotal
.labels(operationName, tableName, 'success')
.inc()
}
return duration
}
static trackMemoryUsage() {
const usage = process.memoryUsage()
logger.info('Memory Usage', {
category: 'PERFORMANCE',
memory: {
rss: usage.rss,
heapTotal: usage.heapTotal,
heapUsed: usage.heapUsed,
external: usage.external
}
})
}
static trackCPUUsage() {
const usage = process.cpuUsage()
logger.info('CPU Usage', {
category: 'PERFORMANCE',
cpu: {
user: usage.user,
system: usage.system
}
})
}
}
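A short, hypothetical usage sketch for the monitor: a UUID keeps concurrent operations from colliding in activeOperations, and the one-minute memory sampling interval is an arbitrary choice:
// Hypothetical usage from a server-side service module
import { randomUUID } from 'crypto'
import { PerformanceMonitor } from './lib/monitoring/performance'
import { prisma } from './lib/database'

async function listPosts() {
  const operationId = randomUUID()
  PerformanceMonitor.startOperation(operationId)
  try {
    return await prisma.post.findMany()
  } finally {
    // Records duration, logs it, and updates the database metrics for the 'post' table
    PerformanceMonitor.endOperation(operationId, 'findMany', 'post')
  }
}

// Sample heap usage once a minute
setInterval(() => PerformanceMonitor.trackMemoryUsage(), 60_000)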
7. Add Alerting System
// server/lib/monitoring/alerts.ts
import { logger } from '../logging/structuredLogger'
interface AlertRule {
name: string
condition: () => boolean
severity: 'LOW' | 'MEDIUM' | 'HIGH' | 'CRITICAL'
message: string
cooldown: number // minutes
}
export class AlertManager {
private static alerts = new Map<string, number>()
private static rules: AlertRule[] = [
{
name: 'high_error_rate',
condition: () => {
// Check if error rate is above 5%
// This would need to be implemented with actual metrics
return false
},
severity: 'HIGH',
message: 'Error rate is above 5%',
cooldown: 15
},
{
name: 'high_memory_usage',
condition: () => {
const usage = process.memoryUsage()
const usagePercent = (usage.heapUsed / usage.heapTotal) * 100
return usagePercent > 90
},
severity: 'CRITICAL',
message: 'Memory usage is above 90%',
cooldown: 5
},
{
name: 'database_connection_failed',
condition: () => {
// This would need to be implemented with actual database health checks
return false
},
severity: 'CRITICAL',
message: 'Database connection failed',
cooldown: 1
}
]
static checkAlerts() {
for (const rule of this.rules) {
if (rule.condition()) {
this.triggerAlert(rule)
}
}
}
private static triggerAlert(rule: AlertRule) {
const now = Date.now()
const lastTriggered = this.alerts.get(rule.name)
// Check cooldown
if (lastTriggered && (now - lastTriggered) < (rule.cooldown * 60 * 1000)) {
return
}
// Trigger alert
this.alerts.set(rule.name, now)
logger.error(`ALERT: ${rule.name}`, {
category: 'ALERT',
severity: rule.severity,
message: rule.message,
timestamp: new Date().toISOString()
})
// Send to external alerting service
this.sendAlert(rule)
}
private static sendAlert(rule: AlertRule) {
// Implement external alerting (Slack, PagerDuty, etc.)
console.log(`🚨 ALERT: ${rule.message}`)
}
static startAlerting() {
// Check alerts every minute
setInterval(() => {
this.checkAlerts()
}, 60000)
}
}
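Two hedged sketches to round this out: starting the checks at server boot, and one possible body for the sendAlert stub that posts to a Slack incoming webhook (the SLACK_WEBHOOK_URL variable and Node 18+ global fetch are assumptions):
// server/index.ts (hypothetical startup wiring)
import { AlertManager } from './lib/monitoring/alerts'

AlertManager.startAlerting() // begin the once-a-minute rule evaluation

// One possible implementation for sendAlert, posting to a Slack incoming webhook
async function sendSlackAlert(message: string, severity: string) {
  const url = process.env.SLACK_WEBHOOK_URL
  if (!url) return // external alerting stays optional
  await fetch(url, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ text: `[${severity}] ${message}` }),
  })
}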
Testing the Implementation
1. Logging Tests
// tests/logging.test.ts
import { logger } from '../server/lib/logging/structuredLogger'
describe('Structured Logging', () => {
it('should log with proper structure', () => {
// winston's Console transport writes to process.stdout, not console.log
const stdoutSpy = jest.spyOn(process.stdout, 'write').mockImplementation(() => true)
logger.info('Test message', {
requestId: 'test-123',
userId: 'user-456'
})
expect(stdoutSpy).toHaveBeenCalledWith(
expect.stringContaining('Test message')
)
stdoutSpy.mockRestore()
})
})
2. Metrics Tests
// tests/metrics.test.ts
import { httpRequestTotal } from '../server/lib/monitoring/metrics'
describe('Metrics', () => {
it('should record HTTP request metrics', async () => {
httpRequestTotal.labels('GET', '/test', '200').inc()
// prom-client exposes recorded samples through the async get() on the metric itself
const metric = await httpRequestTotal.get()
const sample = metric.values.find(
(v) => v.labels.method === 'GET' && v.labels.route === '/test' && v.labels.status_code === '200'
)
expect(sample?.value).toBeGreaterThanOrEqual(1)
})
})
Migration Strategy
Phase 1: Basic Logging
- Implement structured logging
- Add request/response logging
- Set up log rotation
Phase 2: Metrics and Monitoring
- Add Prometheus metrics
- Implement health checks
- Add performance monitoring
Phase 3: Advanced Features
- Add security monitoring
- Implement alerting
- Set up dashboards
Monitoring and Alerting
1. Log Aggregation
# Example ELK stack configuration
elasticsearch:
hosts: ["localhost:9200"]
logstash:
input:
file:
path: "/app/logs/*.log"
filter:
json:
source: "message"
output:
elasticsearch:
hosts: ["localhost:9200"]2. Metrics Dashboard
# Example Grafana dashboard configuration
dashboard:
title: "NSX API Metrics"
panels:
- title: "Request Rate"
type: "graph"
targets:
- expr: "rate(http_requests_total[5m])"
- title: "Error Rate"
type: "graph"
targets:
- expr: "rate(errors_total[5m])"
- title: "Response Time"
type: "graph"
targets:
- expr: "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))"References
Priority: 🟡 Medium
Estimated Effort: 12-16 hours
Risk if not fixed: Medium-High - Poor observability and debugging capabilities