diff --git a/Dockerfile b/Dockerfile
index a9e6dc79..bba34fd3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,4 +13,4 @@
 COPY . *.env ./
 RUN yarn install --no-progress && \
     yarn build-release
-CMD node --experimental-json-modules build/main.js
+CMD node --max-old-space-size=1024 --max-semi-space-size=128 --optimize-for-size --gc-interval=100 --expose-gc --experimental-json-modules build/main.js
diff --git a/src/db/index.ts b/src/db/index.ts
index e31ae385..59c0c6a2 100644
--- a/src/db/index.ts
+++ b/src/db/index.ts
@@ -49,10 +49,19 @@ export const connectDB = async (onConnected: () => any = defaultFn): Promise<void> => {
     'error',
     (e) => {
       logger.error('MongoDB connection error', e)
-      process.exit(1)
+      // Don't exit immediately, let the app try to reconnect
+      // process.exit(1)
     }
   )
 
+  mongoose.connection.on('disconnected', () => {
+    logger.warn('MongoDB disconnected. Attempting to reconnect...')
+  })
+
+  mongoose.connection.on('reconnected', () => {
+    logger.info('MongoDB reconnected successfully')
+  })
+
   await mongoose.connect(
     `${scheme}://${user}:${pass}@${server}/${dbName}?authSource=${authDb}&tls=${tlsFlag}&replicaSet=${rsName}`,
     { autoIndex: true }
diff --git a/src/db/utils/jobs/migration/SirvClient.ts b/src/db/utils/jobs/migration/SirvClient.ts
index e8dcd246..eb3e15b0 100644
--- a/src/db/utils/jobs/migration/SirvClient.ts
+++ b/src/db/utils/jobs/migration/SirvClient.ts
@@ -1,4 +1,5 @@
 import axios from 'axios'
+import { CircuitBreaker, retryWithBackoff } from '../../../../utils/CircuitBreaker'
 
 const SIRV_CONFIG = {
   clientId: process.env.SIRV_CLIENT_ID_RO ?? null,
@@ -9,9 +10,27 @@ const client = axios.create({
   baseURL: 'https://api.sirv.com/v2',
   headers: {
     'content-type': 'application/json'
-  }
+  },
+  timeout: 30000 // 30 second timeout
 })
 
+// Add axios interceptors for better error handling
+client.interceptors.response.use(
+  response => response,
+  async error => {
+    console.error('Sirv API error:', {
+      status: error.response?.status,
+      statusText: error.response?.statusText,
+      data: error.response?.data,
+      config: {
+        method: error.config?.method,
+        url: error.config?.url
+      }
+    })
+    return await Promise.reject(error)
+  }
+)
+
 const headers = {
   'content-type': 'application/json'
 }
@@ -21,6 +40,13 @@ interface TokenParamsType {
   clientId: string | null
   clientSecret: string | null
 }
 
+// Circuit breaker for Sirv API calls
+const sirvCircuitBreaker = new CircuitBreaker({
+  failureThreshold: 3,
+  resetTimeout: 60000, // 1 minute
+  monitoringPeriod: 10000 // 10 seconds
+})
+
 const getToken = async (): Promise<string | null> => {
   const params: TokenParamsType = {
     clientId: SIRV_CONFIG.clientId,
@@ -28,16 +54,19 @@
   }
 
   try {
-    const res = await client.post(
-      '/token',
-      params)
+    const res = await sirvCircuitBreaker.execute(async () => {
+      return await retryWithBackoff(async () => {
+        return await client.post('/token', params)
+      }, 3, 1000, 5000)
+    })
 
     if (res.status === 200) {
       return res.data.token
     }
   } catch (e) {
-    console.error(e)
-    process.exit(1)
+    console.error('Failed to get Sirv token after retries:', e)
+    // Don't exit process - let the app continue without Sirv functionality
+    return null
   }
   return null
 }
@@ -57,22 +86,31 @@ interface FileMetadaata {
  * @returns
  */
 export const getFileInfo = async (filename: string): Promise<FileMetadaata> => {
-  const res = await client.get(
-    '/files/stat?filename=' + encodeURIComponent(filename),
-    {
-      headers: {
-        ...headers,
-        Authorization: `bearer ${token}`
-      }
-    }
-  )
-
-  if (res.status === 200) {
-    const { ctime, mtime } = res.data
-    return ({
-      btime: new Date(ctime),
-      mtime: new Date(mtime)
+  try {
+    const res = await sirvCircuitBreaker.execute(async () => {
+      return await retryWithBackoff(async () => {
+        return await client.get(
+          '/files/stat?filename=' + encodeURIComponent(filename),
+          {
+            headers: {
+              ...headers,
+              Authorization: `bearer ${token}`
+            }
+          }
+        )
+      }, 3, 1000, 5000)
     })
+
+    if (res.status === 200) {
+      const { ctime, mtime } = res.data
+      return ({
+        btime: new Date(ctime),
+        mtime: new Date(mtime)
+      })
+    }
+    throw new Error('Sirv API.getFileInfo() error: ' + String(res.statusText))
+  } catch (e) {
+    console.error('Failed to get file info after retries:', e)
+    throw e
   }
-  throw new Error('Sirv API.getFileInfo() error' + res.statusText)
 }
diff --git a/src/main.ts b/src/main.ts
index 70b015df..3ae30c64 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -1,5 +1,57 @@
 import { connectDB, defaultPostConnect } from './db/index.js'
 import { createServer } from './server.js'
+import { errorMonitor, setupGlobalErrorHandlers } from './utils/ErrorMonitor.js'
 
-await connectDB(defaultPostConnect)
-await createServer()
+// Setup enhanced error monitoring
+setupGlobalErrorHandlers()
+
+// Enhanced error handling with graceful shutdown
+let isShuttingDown = false
+
+process.on('uncaughtException', (error) => {
+  console.error('Uncaught Exception:', error)
+  errorMonitor.logError(error, 'UNCAUGHT_EXCEPTION')
+
+  if (!isShuttingDown) {
+    isShuttingDown = true
+    // Give some time for cleanup before exiting
+    setTimeout(() => {
+      console.log('Final error stats:', errorMonitor.getStats())
+      process.exit(1)
+    }, 5000)
+  }
+})
+
+process.on('unhandledRejection', (reason, promise) => {
+  console.error('Unhandled Rejection at:', promise, 'reason:', reason)
+  const error = reason instanceof Error ? reason : new Error(String(reason))
+  errorMonitor.logError(error, 'UNHANDLED_REJECTION', { promise })
+
+  // Don't exit immediately on unhandled rejections in production
+  // Log the error and continue running
+  if (process.env.NODE_ENV !== 'production') {
+    if (!isShuttingDown) {
+      isShuttingDown = true
+      setTimeout(() => process.exit(1), 5000)
+    }
+  }
+})
+
+process.on('SIGTERM', () => {
+  console.log('SIGTERM received, shutting down gracefully')
+  process.exit(0)
+})
+
+process.on('SIGINT', () => {
+  console.log('SIGINT received, shutting down gracefully')
+  process.exit(0)
+})
+
+try {
+  await connectDB(defaultPostConnect)
+  await createServer()
+  console.log('🚀 Server started successfully')
+} catch (error) {
+  console.error('Failed to start server:', error)
+  process.exit(1)
+}
diff --git a/src/server.ts b/src/server.ts
index 6eab9995..d6543edf 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -49,7 +49,8 @@ export async function createServer (): Promise<{ app: express.Application, serve
     schema,
     plugins: [ApolloServerPluginDrainHttpServer({ httpServer })],
     cache: new InMemoryLRUCache({
-      max: 100
+      max: 50,
+      maxSize: 1024 * 1024 * 10
     })
   })
   // server must be started before applying middleware
@@ -57,8 +58,22 @@
   const context = process.env.LOCAL_DEV_BYPASS_AUTH === 'true' ?
     localDevBypassAuthContext : createContext
 
+  app.get('/health', (req, res) => {
+    const memUsage = process.memoryUsage()
+    res.json({
+      status: 'ok',
+      timestamp: new Date().toISOString(),
+      memory: {
+        rss: `${Math.round(memUsage.rss / 1024 / 1024)}MB`,
+        heapTotal: `${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`,
+        heapUsed: `${Math.round(memUsage.heapUsed / 1024 / 1024)}MB`,
+        external: `${Math.round(memUsage.external / 1024 / 1024)}MB`
+      }
+    })
+  })
+
   app.use('/',
-    bodyParser.json({ limit: '10mb' }),
+    bodyParser.json({ limit: '5mb' }),
     cors(),
     express.json(),
     expressMiddleware(server, {
diff --git a/src/utils/CircuitBreaker.ts b/src/utils/CircuitBreaker.ts
new file mode 100644
index 00000000..aa838c7b
--- /dev/null
+++ b/src/utils/CircuitBreaker.ts
@@ -0,0 +1,120 @@
+/**
+ * Circuit breaker pattern implementation for handling network failures
+ */
+
+export enum CircuitState {
+  CLOSED = 'CLOSED',
+  OPEN = 'OPEN',
+  HALF_OPEN = 'HALF_OPEN'
+}
+
+export interface CircuitBreakerOptions {
+  failureThreshold: number
+  resetTimeout: number
+  monitoringPeriod: number
+}
+
+export class CircuitBreaker {
+  private state: CircuitState = CircuitState.CLOSED
+  private failureCount: number = 0
+  private lastFailureTime?: number
+  private successCount: number = 0
+
+  constructor (
+    private readonly options: CircuitBreakerOptions = {
+      failureThreshold: 5,
+      resetTimeout: 60000, // 1 minute
+      monitoringPeriod: 10000 // 10 seconds
+    }
+  ) {}
+
+  async execute<T> (operation: () => Promise<T>): Promise<T> {
+    if (this.state === CircuitState.OPEN) {
+      if (this.shouldAttemptReset()) {
+        this.state = CircuitState.HALF_OPEN
+      } else {
+        throw new Error('Circuit breaker is OPEN - operation not allowed')
+      }
+    }
+
+    try {
+      const result = await operation()
+      this.onSuccess()
+      return result
+    } catch (error) {
+      this.onFailure()
+      throw error
+    }
+  }
+
+  private onSuccess (): void {
+    this.failureCount = 0
+    if (this.state === CircuitState.HALF_OPEN) {
+      this.state = CircuitState.CLOSED
+    }
+    this.successCount++
+  }
+
+  private onFailure (): void {
+    this.failureCount++
+    this.lastFailureTime = Date.now()
+
+    if (this.failureCount >= this.options.failureThreshold) {
+      this.state = CircuitState.OPEN
+    }
+  }
+
+  private shouldAttemptReset (): boolean {
+    return (
+      this.lastFailureTime != null &&
+      Date.now() - this.lastFailureTime >= this.options.resetTimeout
+    )
+  }
+
+  getState (): CircuitState {
+    return this.state
+  }
+
+  getStats (): { state: CircuitState, failureCount: number, successCount: number, lastFailureTime?: number } {
+    return {
+      state: this.state,
+      failureCount: this.failureCount,
+      successCount: this.successCount,
+      lastFailureTime: this.lastFailureTime
+    }
+  }
+}
+
+/**
+ * Retry with exponential backoff
+ */
+export async function retryWithBackoff<T> (
+  operation: () => Promise<T>,
+  maxRetries: number = 3,
+  initialDelay: number = 1000,
+  maxDelay: number = 10000
+): Promise<T> {
+  let lastError: Error | undefined
+
+  for (let attempt = 1; attempt <= maxRetries; attempt++) {
+    try {
+      return await operation()
+    } catch (error) {
+      lastError = error as Error
+
+      if (attempt === maxRetries) {
+        break
+      }
+
+      const delay = Math.min(
+        initialDelay * Math.pow(2, attempt - 1),
+        maxDelay
+      )
+
+      console.warn(`Operation failed (attempt ${attempt}/${maxRetries}), retrying in ${delay}ms:`, error)
+      await new Promise(resolve => setTimeout(resolve, delay))
+    }
+  }
+
+  throw lastError ?? new Error('Operation failed after all retry attempts')
+}
diff --git a/src/utils/ErrorMonitor.ts b/src/utils/ErrorMonitor.ts
new file mode 100644
index 00000000..c26b9d4a
--- /dev/null
+++ b/src/utils/ErrorMonitor.ts
@@ -0,0 +1,163 @@
+/**
+ * Enhanced error monitoring and alerting system
+ */
+
+export interface ErrorStats {
+  totalErrors: number
+  errorsByType: Map<string, number>
+  recentErrors: Array<{
+    timestamp: Date
+    error: string
+    type: string
+    stack?: string
+  }>
+  lastReset: Date
+}
+
+export class ErrorMonitor {
+  private stats: ErrorStats = {
+    totalErrors: 0,
+    errorsByType: new Map(),
+    recentErrors: [],
+    lastReset: new Date()
+  }
+
+  private readonly maxRecentErrors = 100
+  private readonly alertThresholds = {
+    errorsPerMinute: 10,
+    totalErrors: 50
+  }
+
+  logError (error: Error, type: string = 'unknown', context?: any): void {
+    this.stats.totalErrors++
+
+    // Track errors by type
+    const currentCount = this.stats.errorsByType.get(type) ?? 0
+    this.stats.errorsByType.set(type, currentCount + 1)
+
+    // Add to recent errors
+    this.stats.recentErrors.push({
+      timestamp: new Date(),
+      error: error.message,
+      type,
+      stack: error.stack
+    })
+
+    // Keep only recent errors
+    if (this.stats.recentErrors.length > this.maxRecentErrors) {
+      this.stats.recentErrors = this.stats.recentErrors.slice(-this.maxRecentErrors)
+    }
+
+    // Log the error with context
+    console.error(`[${type}] Error:`, {
+      message: error.message,
+      stack: error.stack,
+      context,
+      timestamp: new Date().toISOString()
+    })
+
+    // Check if we need to alert
+    this.checkAlertThresholds()
+  }
+
+  logGraphQLError (error: any, query?: string, variables?: any): void {
+    const errorType = error.extensions?.code ?? 'GRAPHQL_ERROR'
+    this.logError(error, errorType, { query, variables })
+  }
+
+  logNetworkError (error: Error, url?: string, method?: string): void {
+    this.logError(error, 'NETWORK_ERROR', { url, method })
+  }
+
+  logDatabaseError (error: Error, operation?: string): void {
+    this.logError(error, 'DATABASE_ERROR', { operation })
+  }
+
+  private checkAlertThresholds (): void {
+    const now = new Date()
+    const oneMinuteAgo = new Date(now.getTime() - 60000)
+
+    // Count errors in the last minute
+    const recentErrorCount = this.stats.recentErrors.filter(
+      err => err.timestamp > oneMinuteAgo
+    ).length
+
+    if (recentErrorCount >= this.alertThresholds.errorsPerMinute) {
+      console.error(`🚨 HIGH ERROR RATE ALERT: ${recentErrorCount} errors in the last minute`)
+      this.logSystemStatus()
+    }
+
+    if (this.stats.totalErrors >= this.alertThresholds.totalErrors) {
+      console.error(`🚨 HIGH TOTAL ERROR COUNT: ${this.stats.totalErrors} total errors since ${this.stats.lastReset.toISOString()}`)
+    }
+  }
+
+  private logSystemStatus (): void {
+    const memUsage = process.memoryUsage()
+    const uptime = process.uptime()
+
+    console.log('System Status:', {
+      uptime: `${Math.round(uptime / 60)} minutes`,
+      memory: {
+        heapUsed: `${Math.round(memUsage.heapUsed / 1024 / 1024)}MB`,
+        heapTotal: `${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`,
+        rss: `${Math.round(memUsage.rss / 1024 / 1024)}MB`
+      },
+      errors: {
+        total: this.stats.totalErrors,
+        byType: Object.fromEntries(this.stats.errorsByType)
+      }
+    })
+  }
+
+  getStats (): ErrorStats {
+    return {
+      ...this.stats,
+      errorsByType: new Map(this.stats.errorsByType),
+      recentErrors: [...this.stats.recentErrors]
+    }
+  }
+
+  reset (): void {
+    this.stats = {
+      totalErrors: 0,
+      errorsByType: new Map(),
+      recentErrors: [],
+      lastReset: new Date()
+    }
+  }
+
+  // Get health status based on error rates
+  getHealthStatus (): 'healthy' | 'warning' | 'critical' {
+    const now = new Date()
+    const oneMinuteAgo = new Date(now.getTime() - 60000)
+    const fiveMinutesAgo = new Date(now.getTime() - 300000)
+
+    const errorsLastMinute = this.stats.recentErrors.filter(
+      err => err.timestamp > oneMinuteAgo
+    ).length
+
+    const errorsLastFiveMinutes = this.stats.recentErrors.filter(
+      err => err.timestamp > fiveMinutesAgo
+    ).length
+
+    if (errorsLastMinute >= 10) return 'critical'
+    if (errorsLastFiveMinutes >= 20) return 'warning'
+    return 'healthy'
+  }
+}
+
+// Global error monitor instance
+export const errorMonitor = new ErrorMonitor()
+
+// Setup global error handlers
+export function setupGlobalErrorHandlers (): void {
+  process.on('uncaughtException', (error) => {
+    errorMonitor.logError(error, 'UNCAUGHT_EXCEPTION')
+  })
+
+  process.on('unhandledRejection', (reason, promise) => {
+    const error = reason instanceof Error ? reason : new Error(String(reason))
+    errorMonitor.logError(error, 'UNHANDLED_REJECTION', { promise })
+  })
+}
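
For reference, a minimal usage sketch (not part of the diff) showing how the new CircuitBreaker and retryWithBackoff utilities are meant to compose around an outbound HTTP call, in the same way SirvClient.ts wraps its requests above. The fetchRemoteConfig function and its URL are hypothetical stand-ins; the option values simply mirror the Sirv breaker settings in the diff.

import axios from 'axios'
import { CircuitBreaker, retryWithBackoff } from './utils/CircuitBreaker'

// Same settings as the Sirv breaker above: open the circuit after 3 failures,
// then allow a single probe call (HALF_OPEN) one minute after the last failure.
const breaker = new CircuitBreaker({
  failureThreshold: 3,
  resetTimeout: 60000,
  monitoringPeriod: 10000
})

// Hypothetical caller: retryWithBackoff retries the inner request up to 3 times
// (waiting 1s, then 2s, with delays capped at 5s); each overall failure is counted
// by the breaker, which short-circuits with an error while it is OPEN.
async function fetchRemoteConfig (): Promise<unknown> {
  return await breaker.execute(async () => {
    return await retryWithBackoff(async () => {
      const res = await axios.get('https://example.com/config.json')
      return res.data
    }, 3, 1000, 5000)
  })
}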