diff --git a/development/backend/cron.mdx b/development/backend/cron.mdx new file mode 100644 index 0000000..d8de628 --- /dev/null +++ b/development/backend/cron.mdx @@ -0,0 +1,230 @@ +--- +title: Cron Job Scheduling +description: Schedule recurring background tasks using cron expressions with the DeployStack job queue system. +sidebarTitle: Cron +--- + +# Cron Job Scheduling + +DeployStack includes a cron job scheduling system that integrates seamlessly with the [Background Job Queue](/development/backend/job-queue). This allows you to schedule recurring tasks using standard cron expressions, with all the benefits of the job queue system including persistence, retries, and monitoring. + +## Architecture + +The cron system follows a two-tier architecture: + +1. **Cron Scheduler**: Uses `node-cron` to schedule tasks based on cron expressions +2. **Job Queue**: Processes the actual work with persistence and retry capabilities + +``` +Cron Expression → Scheduler fires → Creates job in queue → Worker processes job +``` + +This separation provides: +- **Reliability**: Jobs persist even if the server restarts +- **Visibility**: All jobs are logged and trackable in the database +- **Rate Limiting**: Built-in queue management prevents system overload +- **Monitoring**: Track success/failure rates and execution history + +## Creating a Cron Job + +### Step 1: Define the Cron Job + +Create a new file in `src/cron/jobs/`: + +```typescript +// src/cron/jobs/dailyCleanup.ts +import type { CronJob } from '../cronManager'; +import type { JobQueueService } from '../../services/jobQueueService'; + +export function createDailyCleanupJob(jobQueueService: JobQueueService): CronJob { + return { + name: 'daily-cleanup', + schedule: '0 2 * * *', // Every day at 2 AM + + task: async () => { + await jobQueueService.createJob('cleanup_old_data', { + daysToKeep: 30 + }); + } + }; +} +``` + +### Step 2: Create the Worker + +Create a worker to process the job in `src/workers/`: + +```typescript +// src/workers/cleanupWorker.ts +import type { AnyDatabase } from '../db'; +import type { FastifyBaseLogger } from 'fastify'; +import type { Worker, WorkerResult } from './types'; + +interface CleanupPayload { + daysToKeep: number; +} + +export class CleanupWorker implements Worker { + constructor( + private readonly db: AnyDatabase, + private readonly logger: FastifyBaseLogger + ) {} + + async execute(payload: unknown, jobId: string): Promise { + const { daysToKeep } = payload as CleanupPayload; + + this.logger.info({ + jobId, + daysToKeep, + operation: 'cleanup_old_data' + }, 'Starting cleanup job'); + + try { + // Your cleanup logic here + const cutoffDate = new Date(); + cutoffDate.setDate(cutoffDate.getDate() - daysToKeep); + + // Example: Delete old records + // const result = await this.db.delete(oldRecordsTable) + // .where(lt(oldRecordsTable.createdAt, cutoffDate)); + + this.logger.info({ + jobId, + operation: 'cleanup_old_data' + }, 'Cleanup completed successfully'); + + return { + success: true, + message: 'Cleanup completed successfully' + }; + } catch (error) { + this.logger.error({ jobId, error }, 'Cleanup job failed'); + throw error; // Triggers retry logic + } + } +} +``` + +### Step 3: Register the Worker + +Add the worker to `src/workers/index.ts`: + +```typescript +import { CleanupWorker } from './cleanupWorker'; + +export function registerWorkers( + processor: JobProcessorService, + db: AnyDatabase, + logger: FastifyBaseLogger +): void { + // ... existing workers ... 
+ + processor.registerWorker( + 'cleanup_old_data', + new CleanupWorker(db, logger) + ); +} +``` + +### Step 4: Register the Cron Job + +Add the cron job to `src/cron/index.ts`: + +```typescript +import { createDailyCleanupJob } from './jobs/dailyCleanup'; + +export function initializeCronJobs( + jobQueueService: JobQueueService, + logger: FastifyBaseLogger +): CronManager { + const cronManager = new CronManager(logger); + + cronManager.register(createDailyCleanupJob(jobQueueService)); + + return cronManager; +} +``` + +## Cron Expression Syntax + +The system uses standard cron syntax with 5 or 6 fields: + +``` +┌────────────── second (optional, 0-59) +│ ┌──────────── minute (0-59) +│ │ ┌────────── hour (0-23) +│ │ │ ┌──────── day of month (1-31) +│ │ │ │ ┌────── month (1-12) +│ │ │ │ │ ┌──── day of week (0-7, 0 or 7 = Sunday) +│ │ │ │ │ │ +* * * * * * +``` + +### Common Examples + +```typescript +'*/2 * * * *' // Every 2 minutes +'0 * * * *' // Every hour (at minute 0) +'0 0 * * *' // Daily at midnight +'0 2 * * *' // Daily at 2 AM +'0 9 * * 1-5' // Weekdays at 9 AM +'*/30 * * * *' // Every 30 minutes +'0 */6 * * *' // Every 6 hours +'0 0 1 * *' // First day of every month +'0 0 * * 0' // Every Sunday at midnight +``` + +## Integration with Job Queue + +The cron system is designed to work with the job queue system. This provides several benefits: + +**Persistence**: Jobs created by cron are stored in the database and survive server restarts. + +**Retry Logic**: Failed jobs are automatically retried with exponential backoff. + +**Rate Limiting**: The job queue processes jobs sequentially, preventing system overload. + +**Monitoring**: Track job execution history, success rates, and failures. + +For more details on the job queue system, see the [Background Job Queue](/development/backend/job-queue) documentation. + +## Example: Complete Implementation + +Here's a complete example showing how to create a cron job that sends a daily email digest: + +```typescript +// src/cron/jobs/dailyDigest.ts +import type { CronJob } from '../cronManager'; +import type { JobQueueService } from '../../services/jobQueueService'; + +export function createDailyDigestJob(jobQueueService: JobQueueService): CronJob { + return { + name: 'daily-digest-email', + schedule: '0 8 * * *', // Every day at 8 AM + + task: async () => { + // Create job to send digest email + await jobQueueService.createJob('send_email', { + to: 'admin@example.com', + subject: 'Daily Activity Digest', + template: 'daily_digest', + variables: { + date: new Date().toISOString().split('T')[0] + } + }); + } + }; +} +``` + +The `send_email` worker (already registered in the system) will process this job using the existing [Email System](/development/backend/mail). + +## Lifecycle Management + +The cron system is automatically initialized during server startup and gracefully shut down when the server stops: + +**Startup**: All registered cron jobs are scheduled and begin running according to their expressions. + +**Shutdown**: When the server receives a shutdown signal, cron jobs stop creating new jobs, allowing the job queue to finish processing existing jobs. + +This ensures no jobs are lost during server restarts or deployments. 
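The `CronJob` and `CronManager` types imported above come from `src/cron/cronManager.ts`, which this guide does not show. For orientation, here is a minimal sketch of what they might look like on top of `node-cron`; the field names, validation, and error handling are assumptions for illustration, not the actual implementation.

```typescript
// Hypothetical sketch of src/cron/cronManager.ts (names and shapes are assumptions)
import * as cron from 'node-cron';
import type { ScheduledTask } from 'node-cron';
import type { FastifyBaseLogger } from 'fastify';

export interface CronJob {
  name: string;
  schedule: string;            // standard cron expression (5 or 6 fields)
  task: () => Promise<void>;   // typically just enqueues a job in the job queue
}

export class CronManager {
  private readonly tasks = new Map<string, ScheduledTask>();

  constructor(private readonly logger: FastifyBaseLogger) {}

  register(job: CronJob): void {
    if (!cron.validate(job.schedule)) {
      throw new Error(`Invalid cron expression for ${job.name}: ${job.schedule}`);
    }

    const scheduled = cron.schedule(job.schedule, async () => {
      try {
        await job.task();
      } catch (error) {
        // The task only enqueues work; failures here are logged, retries happen in the queue
        this.logger.error({ cronJob: job.name, error }, 'Cron task failed');
      }
    });

    this.tasks.set(job.name, scheduled);
    this.logger.info({ cronJob: job.name, schedule: job.schedule }, 'Cron job registered');
  }

  stopAll(): void {
    // Called during graceful shutdown so no new queue jobs are created
    for (const [name, task] of this.tasks) {
      task.stop();
      this.logger.info({ cronJob: name }, 'Cron job stopped');
    }
  }
}
```

Because scheduled tasks only enqueue work, a slow or failing task never blocks the scheduler itself; retries and backoff happen in the job queue.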
diff --git a/development/backend/index.mdx b/development/backend/index.mdx index 20fd76a..bc52b46 100644 --- a/development/backend/index.mdx +++ b/development/backend/index.mdx @@ -112,6 +112,14 @@ The development server starts at `http://localhost:3000` with API documentation Database-backed job processing system with persistent storage, automatic retries, and rate limiting for long-running background tasks. + + Schedule recurring background tasks using cron expressions integrated with the job queue system. + + { + const { startTime, endTime, interval, filters } = params; + + const startTimestamp = Math.floor(startTime.getTime() / 1000); + const endTimestamp = Math.floor(endTime.getTime() / 1000); + + // Build query with filters + const whereConditions = [ + eq(serverInstallMetrics.user_id, filters.user_id), + eq(serverInstallMetrics.team_id, filters.team_id), + eq(serverInstallMetrics.bucket_interval, interval), + gte(serverInstallMetrics.bucket_timestamp, startTimestamp), + lte(serverInstallMetrics.bucket_timestamp, endTimestamp) + ]; + + // Optional filters + if (filters.server_id) { + whereConditions.push(eq(serverInstallMetrics.server_id, filters.server_id)); + } + + if (filters.satellite_id) { + whereConditions.push(eq(serverInstallMetrics.satellite_id, filters.satellite_id)); + } + + // Query with aggregation (handle multiple satellites/servers per bucket) + const results = await this.db + .select({ + bucket_timestamp: serverInstallMetrics.bucket_timestamp, + installation_count: sql`SUM(${serverInstallMetrics.installation_count})`, + uninstallation_count: sql`SUM(${serverInstallMetrics.uninstallation_count})` + }) + .from(serverInstallMetrics) + .where(and(...whereConditions)) + .groupBy(serverInstallMetrics.bucket_timestamp) + .orderBy(serverInstallMetrics.bucket_timestamp); + + return results.map(row => ({ + timestamp: row.bucket_timestamp, + installation_count: row.installation_count, + uninstallation_count: row.uninstallation_count + })); + } + + // Public method for API endpoint + async getServerInstallMetrics( + userId: string, + teamId: string, + timeRange: string, + interval: string, + serverId?: string, + satelliteId?: string + ): Promise { + // Parse time range + const { start, end } = this.parseTimeRange(timeRange); + + // Validate interval + this.validateInterval(interval); + + // Generate bucket timestamps + const timestamps = this.generateBucketTimestamps(start, end, interval); + + // Build filters + const filters: Record = { + user_id: userId, + team_id: teamId + }; + + if (serverId) filters.server_id = serverId; + if (satelliteId) filters.satellite_id = satelliteId; + + // Query actual data + const buckets = await this.queryBuckets({ + startTime: start, + endTime: end, + interval, + filters + }); + + // Fill missing buckets with zeros + const filledBuckets = this.fillMissingBuckets(buckets, timestamps); + + // Calculate summary statistics + const summary = this.calculateSummary(filledBuckets); + + // Format standardized response + return this.formatResponse({ + metricType: this.getMetricType(), + timeRange: { start, end, interval }, + filters, + buckets: filledBuckets, + summary + }); + } +} +``` + +**Key Points**: +- Extends `TimeSeriesMetricsService` +- Implements `getMetricType()` and `queryBuckets()` +- Uses `SUM()` and `GROUP BY` for aggregation +- Handles optional filters (server_id, satellite_id, etc.) 
+- Orchestrates full flow in public method + +### Step 5: Add Permissions + +Add the metric permission to `src/permissions/index.ts`: + +```typescript +export const ROLE_DEFINITIONS = { + global_user: [ + // ... existing permissions + 'metrics.server_install_metrics.view', // Add your metric permission + ], + global_admin: [ + // ... existing permissions + 'metrics.server_install_metrics.view', // Also add to admin if needed + ], + // ... other roles +} as const; +``` + +**Permission Naming Convention**: `metrics..view` + +For detailed information about the permission system, role hierarchy, and access control patterns, see [Role Management](/development/backend/roles). + +### Step 6: Create API Endpoint + +Create your API endpoint following the established patterns: + +**File Structure**: `src/routes/users/me/metrics/your-metric.ts` + +**Key Requirements**: +- Use `preValidation` for authorization (requires permission check) +- Accept query parameters: `team_id` (required), `time_range`, `interval`, optional filters +- Return standardized time-series response from service +- Manual JSON serialization with `JSON.stringify()` + +**Reference Implementation**: `src/routes/users/me/metrics/mcp/client-activity.ts` + +**Complete Patterns**: +- Authorization: [API Security Best Practices](/development/backend/api/security) +- Request/Response schemas: [API Documentation](/development/backend/api/) +- Query parameters: Use reusable schema constants pattern from reference implementation + +**Basic Handler Structure**: +```typescript +export default async function yourMetricsRoute(server: FastifyInstance) { + server.get('/users/me/metrics/your-metric', { + preValidation: requirePermission('metrics.your_table_name.view'), + schema: { + tags: ['Metrics'], + summary: 'Get your metrics', + security: [{ cookieAuth: [] }], + querystring: QUERY_PARAMS_SCHEMA, // Define your schema constants + response: { + 200: SUCCESS_RESPONSE_SCHEMA, + 400: ERROR_RESPONSE_SCHEMA, + 401: ERROR_RESPONSE_SCHEMA, + 403: ERROR_RESPONSE_SCHEMA + } + } + }, async (request, reply) => { + const userId = request.user!.id; + const query = request.query as QueryParams; + + const db = getDb(); + const metricsService = new YourMetricsService(db); + + const result = await metricsService.getYourMetrics( + userId, + query.team_id, + query.time_range || '24h', + query.interval || '1h' + // ... optional filters + ); + + const response = { success: true, data: result.data }; + const jsonString = JSON.stringify(response); + return reply.status(200).type('application/json').send(jsonString); + }); +} +``` + +**Register the route** in `src/routes/users/me/metrics/index.ts`. + +### Step 7: Set Up Data Collection + +Write data to your metrics table from the appropriate source. 
Common patterns: + +#### Pattern A: Event Handler (Real-Time Collection) + +For real-time metrics collection from satellite events, follow the `mcp-client-activity.ts` event handler pattern: +- Calculate bucket timestamps for your intervals (15m, 1h) +- Use UPSERT with composite unique constraint +- Write to multiple bucket intervals simultaneously +- Non-fatal error handling (log errors, don't block event processing) + +**Reference**: `src/events/satellite/mcp-client-activity.ts` + +#### Pattern B: Cron Job (Periodic Aggregation) + +For periodic aggregation from existing data, use the cron + worker pattern: +- Create cron job definition in `src/cron/jobs/` +- Create worker in `src/workers/` to implement aggregation logic +- Register both in `src/cron/index.ts` and `src/workers/index.ts` + +For detailed patterns, see [Cron Scheduling](/development/backend/cron) and [Background Job Queue](/development/backend/job-queue). + +### Step 8: Add Cleanup System + +Create a cleanup worker and cron job following the MCP client activity cleanup pattern: + +**Key Components**: +- Cleanup worker in `src/workers/` implementing the `Worker` interface +- Cron job definition in `src/cron/jobs/` scheduling the cleanup +- Registration in `src/workers/index.ts` and `src/cron/index.ts` + +**Cleanup Pattern**: +- Calculate cutoff timestamp based on retention period +- Delete old buckets using time-based index: `lt(table.bucket_timestamp, cutoffTimestamp)` +- Handle both database drivers: `(result.changes || result.rowsAffected || 0)` +- Log deletion statistics for monitoring + +**Reference Implementation**: +- Worker: `src/workers/mcpClientActivityMetricsCleanupWorker.ts` +- Cron Job: `src/cron/jobs/mcpClientActivityMetricsCleanup.ts` + +For detailed patterns and scheduling, see [Cron Scheduling](/development/backend/cron) and [Background Job Queue](/development/backend/job-queue). + +## Metrics-Specific Patterns + +### Bucket Timestamp Calculation + +Always round down to bucket boundaries for alignment: + +```typescript +// Unix seconds, not milliseconds +const unixSeconds = Math.floor(eventTime.getTime() / 1000); + +// Round down to bucket start +const bucketStart = Math.floor(unixSeconds / intervalSeconds) * intervalSeconds; + +// Example: 10:07:30 with 15m interval (900s) +// floor(1736512050 / 900) * 900 = 1736512000 (10:00:00) +``` + +### Service Aggregation Pattern + +Always aggregate multiple records per bucket using `SUM()` and `GROUP BY`: + +```typescript +// CORRECT - Aggregates multiple satellites/servers per bucket +const results = await this.db + .select({ + bucket_timestamp: metrics.bucket_timestamp, + total_count: sql`SUM(${metrics.count})` + }) + .from(metrics) + .where(conditions) + .groupBy(metrics.bucket_timestamp) + .orderBy(metrics.bucket_timestamp); +``` + +### Database Driver Compatibility + +Handle both SQLite (`changes`) and Turso (`rowsAffected`): + +```typescript +const deletedCount = (result.changes || result.rowsAffected || 0); +``` + +For more details, see [Database Driver Compatibility](/development/backend/database/#database-driver-compatibility). 
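The bucket calculation above and the UPSERT write from Step 7 (Pattern A) meet in the event-handler write path. Below is a hedged sketch of that pattern using Drizzle's `onConflictDoUpdate`; the schema import, column list, and conflict target mirror the `serverInstallMetrics` example in this guide and are assumptions, so treat `src/events/satellite/mcp-client-activity.ts` as the authoritative reference.

```typescript
// Hypothetical event-handler write path; schema import path and column names
// follow the serverInstallMetrics example above and are assumptions.
import { sql } from 'drizzle-orm';
import type { AnyDatabase } from '../../db';
import { serverInstallMetrics } from '../../db/schema';

const BUCKET_INTERVALS: Record<string, number> = { '15m': 900, '1h': 3600 };

interface InstallEvent {
  user_id: string;
  team_id: string;
  server_id: string;
  satellite_id: string;
  occurred_at: Date;
}

export async function recordInstallEvent(db: AnyDatabase, event: InstallEvent): Promise<void> {
  const unixSeconds = Math.floor(event.occurred_at.getTime() / 1000);

  // Write one row per bucket interval so every supported granularity stays queryable
  for (const [interval, intervalSeconds] of Object.entries(BUCKET_INTERVALS)) {
    const bucketTimestamp = Math.floor(unixSeconds / intervalSeconds) * intervalSeconds;

    await db
      .insert(serverInstallMetrics)
      .values({
        user_id: event.user_id,
        team_id: event.team_id,
        server_id: event.server_id,
        satellite_id: event.satellite_id,
        bucket_interval: interval,
        bucket_timestamp: bucketTimestamp,
        installation_count: 1,
        uninstallation_count: 0
      })
      // Relies on the composite unique constraint across the dimension and bucket columns
      .onConflictDoUpdate({
        target: [
          serverInstallMetrics.user_id,
          serverInstallMetrics.team_id,
          serverInstallMetrics.server_id,
          serverInstallMetrics.satellite_id,
          serverInstallMetrics.bucket_interval,
          serverInstallMetrics.bucket_timestamp
        ],
        set: {
          installation_count: sql`${serverInstallMetrics.installation_count} + 1`
        }
      });
  }
}
```

An uninstall event would be handled symmetrically by incrementing `uninstallation_count`, and in the real handler this write is wrapped in non-fatal error handling so a metrics failure is logged without blocking event processing.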
+ + + +## Common Pitfalls + +### ❌ Using milliseconds instead of seconds +```typescript +// WRONG +const timestamp = Date.now(); // Returns milliseconds + +// CORRECT +const timestamp = Math.floor(Date.now() / 1000); // Convert to seconds +``` + +### ❌ Not aggregating buckets +```typescript +// WRONG - Returns multiple rows per bucket +const results = await db.select() + .from(metrics) + .where(conditions); + +// CORRECT - Aggregates multiple records per bucket +const results = await db.select({ + bucket_timestamp: metrics.bucket_timestamp, + total_count: sql`SUM(${metrics.count})` +}) +.from(metrics) +.where(conditions) +.groupBy(metrics.bucket_timestamp); +``` + +### ❌ Not handling both database drivers +```typescript +// WRONG - Only works with SQLite +const deletedCount = result.changes; + +// CORRECT - Works with both SQLite and Turso +const deletedCount = (result.changes || result.rowsAffected || 0); +``` + +## Related Documentation + +- [Database Management](/development/backend/database/) - Schema design, migrations, Drizzle ORM +- [API Security](/development/backend/api/security) - Authorization patterns and security best practices +- [Role Management](/development/backend/roles) - Permission system details +- [Background Job Queue](/development/backend/job-queue) - Worker and job queue system +- [Cron Scheduling](/development/backend/cron) - Scheduled task management + +## Summary + +Adding a new metric type involves: + +1. **Define requirements** (counters, dimensions, intervals, retention) +2. **Create database table** with proper indexes and constraints +3. **Generate and apply migration** +4. **Create metric service** extending `TimeSeriesMetricsService` +5. **Add permissions** to roles system +6. **Create API endpoint** with security checks +7. **Set up data collection** (event handlers or cron jobs) +8. **Add cleanup system** (worker + cron job) + +The generic infrastructure handles bucket generation, gap filling, and response formatting - you only write table-specific query logic. Follow the MCP Client Activity Metrics implementation as your reference template. 
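To make the generic bucket generation and gap filling concrete, here is a small standalone sketch of zero-filling missing buckets. The base `TimeSeriesMetricsService` already provides `generateBucketTimestamps` and `fillMissingBuckets`, so the signatures below are illustrative assumptions rather than the actual base-class API.

```typescript
// Standalone illustration only; the base class provides equivalents of these helpers.
interface InstallBucket {
  timestamp: number;              // Unix seconds, aligned to the bucket start
  installation_count: number;
  uninstallation_count: number;
}

function generateBucketTimestamps(start: Date, end: Date, intervalSeconds: number): number[] {
  const startSec = Math.floor(start.getTime() / 1000 / intervalSeconds) * intervalSeconds;
  const endSec = Math.floor(end.getTime() / 1000);
  const timestamps: number[] = [];
  for (let ts = startSec; ts <= endSec; ts += intervalSeconds) {
    timestamps.push(ts);
  }
  return timestamps;
}

function fillMissingBuckets(buckets: InstallBucket[], timestamps: number[]): InstallBucket[] {
  // Index the rows that exist, then emit exactly one bucket per expected timestamp
  const byTimestamp = new Map(buckets.map(b => [b.timestamp, b] as const));
  return timestamps.map(ts =>
    byTimestamp.get(ts) ?? { timestamp: ts, installation_count: 0, uninstallation_count: 0 }
  );
}
```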
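The cleanup system from step 8 is described only in bullet form earlier in this guide. The following is a hedged sketch of such a retention worker; the table import, payload shape, and class name are assumptions, and `src/workers/mcpClientActivityMetricsCleanupWorker.ts` remains the reference implementation.

```typescript
// Hypothetical sketch of src/workers/serverInstallMetricsCleanupWorker.ts
// (table name and import paths are assumptions based on the examples above)
import { lt } from 'drizzle-orm';
import type { FastifyBaseLogger } from 'fastify';
import type { AnyDatabase } from '../db';
import type { Worker, WorkerResult } from './types';
import { serverInstallMetrics } from '../db/schema';

interface CleanupPayload {
  retentionDays: number;
}

export class ServerInstallMetricsCleanupWorker implements Worker {
  constructor(
    private readonly db: AnyDatabase,
    private readonly logger: FastifyBaseLogger
  ) {}

  async execute(payload: unknown, jobId: string): Promise<WorkerResult> {
    const { retentionDays } = payload as CleanupPayload;

    // Cutoff in Unix seconds, matching how bucket_timestamp is stored
    const cutoffTimestamp = Math.floor(Date.now() / 1000) - retentionDays * 24 * 60 * 60;

    // Uses the time-based index on bucket_timestamp
    const result = await this.db
      .delete(serverInstallMetrics)
      .where(lt(serverInstallMetrics.bucket_timestamp, cutoffTimestamp));

    // SQLite reports `changes`, Turso/libSQL reports `rowsAffected`
    const { changes, rowsAffected } = result as { changes?: number; rowsAffected?: number };
    const deletedCount = changes ?? rowsAffected ?? 0;

    this.logger.info({ jobId, cutoffTimestamp, deletedCount }, 'Metrics cleanup completed');

    return {
      success: true,
      message: `Deleted ${deletedCount} expired metric buckets`
    };
  }
}
```

A matching cron job definition in `src/cron/jobs/` would enqueue this worker on a nightly schedule, following the same registration steps shown in the [Cron Scheduling](/development/backend/cron) documentation.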
diff --git a/development/satellite/architecture.mdx b/development/satellite/architecture.mdx index 9530e82..87352f7 100644 --- a/development/satellite/architecture.mdx +++ b/development/satellite/architecture.mdx @@ -378,43 +378,6 @@ Configuration → Spawn → Monitor → Health Check → Restart/Terminate - **Automatic Recovery**: Restart failed processes - **Resource Limits**: Enforce team quotas -## Development Roadmap - -### Phase 1: MCP Transport Implementation ✅ COMPLETED -- **SSE Transport**: Server-Sent Events with session management -- **SSE Messaging**: JSON-RPC message sending via sessions -- **Streamable HTTP**: Direct HTTP communication with optional streaming -- **Session Management**: Cryptographically secure session handling -- **JSON-RPC 2.0**: Full protocol compliance with error handling - -### Phase 2: MCP Server Process Management ✅ COMPLETED -- **Process Lifecycle**: Spawn, monitor, terminate MCP servers with auto-restart -- **stdio Communication**: JSON-RPC 2.0 over stdin/stdout with buffer-based parsing -- **Tool Discovery**: Discover and cache tools from stdio MCP servers -- **Health Monitoring**: Process health checks and crash detection -- **Auto-Restart**: Max 3 attempts with exponential backoff, then permanently_failed status -- **Team-Aware Reporting**: processes_by_team in heartbeat every 30 seconds - -### Phase 3: Team Isolation -- **Resource Boundaries**: CPU and memory limits -- **Process Isolation**: Namespaces and process groups -- **Filesystem Isolation**: Team-specific directories -- **Credential Management**: Secure environment injection - -### Phase 4: Backend Integration ✅ COMPLETED -- **HTTP Polling**: Communication with DeployStack Backend -- **Configuration Sync**: Dynamic configuration updates -- **Status Reporting**: Real-time metrics and health -- **Command Processing**: Execute Backend commands - -For detailed information about the polling implementation, see [Backend Polling Implementation](/development/satellite/polling). - -### Phase 5: Enterprise Features -- **OAuth 2.1 Authentication**: Full authentication server -- **HTTP Proxy**: External MCP server proxying -- **Advanced Monitoring**: Comprehensive observability -- **Multi-Region Support**: Global deployment - ## Technical Implementation Details ### Current Implementation Specifications diff --git a/development/satellite/backend-communication.mdx b/development/satellite/backend-communication.mdx index 10698b8..4f697c8 100644 --- a/development/satellite/backend-communication.mdx +++ b/development/satellite/backend-communication.mdx @@ -67,77 +67,6 @@ The satellite uses three distinct communication channels with the Backend: For detailed event system documentation, see [Event System](/development/satellite/event-system). 
-## Current Implementation - -### Phase 1: Basic Connection Testing ✅ - -The satellite currently implements basic Backend connectivity: - -**Environment Configuration:** -```bash -# .env file -DEPLOYSTACK_BACKEND_URL=http://localhost:3000 -``` - -**Backend Client Service:** -- Connection testing with 5-second timeout -- Health endpoint validation at `/api/health` -- Structured error responses with timing metrics -- Last connection status and response time tracking - -**Fail-Fast Startup Logic:** -```typescript -const connectionStatus = await backendClient.testConnection(); -if (connectionStatus.connection_status === 'connected') { - server.log.info('✅ Backend connection verified'); -} else { - server.log.error('❌ Backend unreachable - satellite cannot start'); - process.exit(1); -} -``` - -**Debug Endpoint:** -- `GET /api/status/backend` - Returns connection status for troubleshooting - -### Phase 2: Satellite Registration ✅ - -Satellite registration is now fully implemented with secure JWT-based token authentication preventing unauthorized satellite connections. - -For complete registration documentation, see [Satellite Registration](/development/satellite/registration). For backend token management details, see [Registration Token Authentication](/development/backend/api/security#registration-token-authentication). - -### Phase 3: Heartbeat Authentication ✅ - -**API Key Authentication:** -- Bearer token authentication implemented for heartbeat requests -- API key validation using argon2 hash verification -- Automatic key rotation on satellite re-registration - -**Heartbeat Implementation:** -- 30-second interval heartbeat reporting -- System metrics collection (CPU, memory, uptime) -- Process status reporting (empty array for now) -- Authenticated communication with Backend - -### Phase 4: Command Polling ✅ - -**Command Polling Implementation:** -- Adaptive polling intervals based on command priorities -- Command queue processing with immediate, high, and normal priorities -- Status reporting and acknowledgment system -- Automatic polling mode switching based on pending commands - -**Priority-Based Polling:** -- `immediate` priority commands trigger 2-second polling intervals -- `high` priority commands trigger 10-second polling intervals -- `normal` priority commands trigger 30-second polling intervals -- No pending commands default to 60-second polling intervals - -**Command Processing:** -- MCP installation commands trigger configuration refresh -- MCP deletion commands trigger process cleanup -- System update commands trigger component updates -- Command completion reporting with correlation IDs - ## Communication Components ### Command Polling diff --git a/development/satellite/event-system.mdx b/development/satellite/event-system.mdx index 63c4712..a74bf76 100644 --- a/development/satellite/event-system.mdx +++ b/development/satellite/event-system.mdx @@ -66,7 +66,7 @@ Satellite Components EventBus Backend **Naming Convention**: All event data fields use **snake_case** (e.g., `server_id`, `team_id`, `spawn_duration_ms`) to match the backend API convention. -The satellite emits 10 event types across 4 categories: +The satellite emits 12 event types across 4 categories: ### MCP Server Lifecycle @@ -148,6 +148,46 @@ Emitted when server exhausts all 3 restart attempts. } ``` +#### `mcp.server.dormant` +Emitted when idle stdio process is terminated to save resources. 
+ +**Data Structure:** +```typescript +{ + server_id: string; + server_slug: string; + team_id: string; + process_id: number; + idle_duration_seconds: number; + last_activity_at: string; // ISO 8601 timestamp +} +``` + +**Purpose:** +- Track resource optimization (memory/CPU savings) +- Monitor process sleep patterns +- Alert on unexpected idle behavior + +#### `mcp.server.respawned` +Emitted when dormant process is automatically respawned on API call. + +**Data Structure:** +```typescript +{ + server_id: string; + server_slug: string; + team_id: string; + process_id: number; + dormant_duration_seconds: number; + respawn_duration_ms: number; +} +``` + +**Purpose:** +- Track transparent process respawning +- Measure respawn latency (1-3s typical) +- Monitor usage patterns after idle periods + ### Client Connections #### `mcp.client.connected` diff --git a/development/satellite/idle-process-management.mdx b/development/satellite/idle-process-management.mdx new file mode 100644 index 0000000..59645ed --- /dev/null +++ b/development/satellite/idle-process-management.mdx @@ -0,0 +1,472 @@ +--- +title: Idle Process Management +description: Automatic termination and respawning of idle stdio subprocess MCP servers to optimize memory usage and resource utilization in DeployStack Satellite. +--- + +# Idle Process Management + +DeployStack Satellite implements intelligent idle process management for stdio subprocess MCP servers. This system automatically terminates processes that remain inactive for extended periods and respawns them on-demand, optimizing memory usage while maintaining instant availability for users. + + +**Purpose**: Idle process management reduces memory consumption from constantly running MCP servers by terminating inactive processes. When a client needs a dormant server, the system automatically respawns it within 1-3 seconds, providing a balance between resource efficiency and user experience. + + +## Overview + +Idle process management works through three coordinated systems: + +- **Idle Process Cleanup Job**: Monitors running processes and terminates those exceeding idle timeout +- **Dormant State Tracking**: RuntimeState maintains configurations of terminated processes for quick respawning +- **Automatic Respawning**: ProcessManager respawns dormant processes when clients request them + +## Idle Detection & Termination + +### Idle Timeout Configuration + +Processes are considered idle based on inactivity duration: + +**Default Timeout**: 180 seconds (3 minutes) + +**Configuration**: +```bash +# Set custom idle timeout (in seconds) +export MCP_PROCESS_IDLE_TIMEOUT_SECONDS=300 # 5 minutes +``` + +**Activity Tracking**: +- `lastActivity` timestamp updated on every message sent or received +- Idle duration calculated as: `now - lastActivity` +- Only stdio transport processes are subject to idle termination + +### Idle Process Cleanup Job + +The cleanup job runs automatically every 30 seconds: + +**Operation Flow**: +1. Retrieve all running stdio processes from ProcessManager +2. Check each process against idle criteria +3. Terminate idle processes and mark as dormant +4. Update RuntimeState with dormant configurations + +**Idle Criteria Checks**: +- Process status must be 'running' (skips 'starting', 'terminating', etc.) 
+- Process age must exceed spawn grace period (60 seconds) +- No active requests in flight +- Idle duration exceeds configured timeout (180 seconds default) + +### Spawn Grace Period + +Newly spawned processes receive protection from immediate termination: + +**Grace Period**: 60 seconds (configurable) + +**Configuration**: +```bash +# Set custom grace period (in seconds) +export MCP_PROCESS_SPAWN_GRACE_PERIOD_SECONDS=90 +``` + +**Protection Rules**: +- Processes younger than grace period cannot be marked idle +- Allows time for MCP handshake completion +- Prevents termination during tool discovery +- Ensures processes become fully operational before idle monitoring begins + + +**Grace Period Purpose**: Without the grace period, processes could be terminated during initialization, causing race conditions where the handshake completes but the process is immediately marked idle and terminated. The 60-second default provides ample time for handshake, tool discovery, and initial activity. + + +### Termination Process + +When a process exceeds the idle timeout: + +**Steps**: +1. Log idle duration and last activity timestamp +2. Store process configuration in dormant map (RuntimeState) +3. Emit `mcp.server.dormant` event to Backend +4. Execute graceful process termination +5. **Tools remain cached** for fast respawn (not cleared) +6. Remove from active process tracking maps + +**Event Emission**: +```typescript +{ + type: 'mcp.server.dormant', + data: { + server_id: string, + server_slug: string, + team_id: string, + process_id: number, + idle_duration_seconds: number, + last_activity_at: string (ISO 8601) + } +} +``` + +## Dormant State Management + +### Dormant Configuration Storage + +RuntimeState maintains a separate map of dormant process configurations: + +**Stored Information**: +- Complete MCPServerConfig (command, args, env, installation details) +- Allows identical respawn without Backend communication +- Remains in memory until process respawns or satellite restarts + +**Map Structure**: +```typescript +// Installation name → MCPServerConfig +Map +``` + +### Dormant vs Active Tracking + +**Active Processes**: +- Tracked in ProcessManager maps (by ID, by name) +- Have active ProcessInfo with status, metrics, handlers +- Consume memory for process overhead and buffers + +**Dormant Processes**: +- Only configuration stored in RuntimeState +- No active process or handlers +- Minimal memory footprint (~1-2KB per config) +- **Tools remain in cache** for instant availability + +### Dormant Process Queries + +RuntimeState provides methods for dormant process inspection: + +**Query Methods**: +- `getDormantConfig(installationName)`: Retrieve specific config +- `getDormantCount()`: Count total dormant processes +- `getAllDormantConfigs()`: List all dormant configurations + +**Heartbeat Reporting**: +- Dormant count included in heartbeat data +- Enables Backend visibility into idle process management +- Tracks dormant vs active process ratio + +## Automatic Respawning + +### Respawn Trigger + +Dormant processes respawn automatically when clients request them: + +**Trigger Points**: +1. MCP client calls `tools/list` and process is dormant +2. MCP client calls `tools/call` for tool on dormant server +3. 
Any MCP request targeting dormant installation name + +**Detection**: +```typescript +// ProcessManager checks for dormant config +const dormantConfig = runtimeState.getDormantConfig(installationName); +if (dormantConfig) { + // Respawn process +} +``` + +### Respawn Process Flow + +The respawn process follows the same path as initial spawn: + +``` +Request → Check Active → Check Dormant → Spawn Process → Handshake → Ready (tools already cached) + │ │ │ │ │ │ + Client Not Found Config Found child_process Initialize Serve +``` + +**Timing**: +- Respawn duration: 1-2 seconds (faster - no tool discovery needed) +- Includes handshake only (tools already cached) +- Client experiences minimal delay on first request after dormancy + +### Concurrent Respawn Prevention + +Multiple concurrent requests to the same dormant process are handled safely: + +**Respawn Lock Mechanism**: +- First request initiates respawn and stores Promise in map +- Subsequent requests await the same Promise +- All requests resolve when respawn completes +- Prevents duplicate spawning of same process + +**Implementation**: +```typescript +// ProcessManager tracks in-progress respawns +private respawningProcesses = new Map>(); +``` + +### Post-Respawn Cleanup + +After successful respawn: + +**Cleanup Operations**: +1. Remove configuration from dormant map +2. Add ProcessInfo to active tracking maps +3. Emit `mcp.server.respawned` event +4. **Tools already cached** (no rediscovery needed) +5. Remove respawn Promise from tracking + +**Event Emission**: +```typescript +{ + type: 'mcp.server.respawned', + data: { + server_id: string, + server_slug: string, + team_id: string, + process_id: number, + dormant_duration_seconds: number, + respawn_duration_ms: number + } +} +``` + +## Performance Characteristics + +### Memory Savings + +Idle process management provides significant memory benefits: + +**Per-Process Savings**: +- Active process: ~10-20MB (base Node.js + application) +- Dormant config: ~1-2KB (configuration only) +- Reduction: ~99% memory per idle process + +**Example Scenario**: +- 100 MCP servers installed +- 10 actively used (100MB memory) +- 90 dormant (180KB memory) +- Total: ~100MB vs ~1.5GB if all active + +### Timing Impact + +**User Experience**: +- Active process: Instant response (~10-50ms latency) +- Dormant process first request: 1-2 second delay (faster - no tool discovery) +- Subsequent requests: Instant (process remains active) + +**Respawn Timing Breakdown**: +- Process spawn: 500-1000ms +- MCP handshake: 500-1000ms +- Total: 1000-2000ms (tools already cached, no discovery needed) + +### Cleanup Efficiency + +**Job Performance**: +- Runs every 30 seconds +- Checks all processes: \<1ms per process +- Termination overhead: ~100ms per process +- Minimal CPU impact during normal operation + +## Configuration Best Practices + +### Idle Timeout Selection + +Choose timeout based on usage patterns: + +**Short Timeout (60-120 seconds)**: +- High memory constraints +- Predictable usage patterns +- Acceptable respawn delay +- Many infrequently used servers + +**Medium Timeout (180-300 seconds)** (Default): +- Balanced memory vs experience +- Mixed usage patterns +- Occasional bursty activity +- Recommended for most deployments + +**Long Timeout (600+ seconds)**: +- Ample memory available +- Continuous or frequent usage +- Minimal respawn tolerance +- Mission-critical low latency + +### Grace Period Tuning + +Adjust grace period based on environment: + +**Shorter Grace Period (30-45 seconds)**: +- Fast network 
connections +- Simple MCP servers (quick handshake) +- Aggressive memory optimization + +**Standard Grace Period (60 seconds)** (Default): +- Recommended for production +- Accounts for npx package downloads +- Handles slow network conditions +- Prevents initialization race conditions + +**Longer Grace Period (90-120 seconds)**: +- Slow network environments +- Complex MCP servers (large dependencies) +- Extra safety margin + + +**Monitoring Recommendation**: Track `mcp.server.dormant` and `mcp.server.respawned` events to tune idle timeout. If processes frequently dormant but respawn shortly after, increase timeout. If processes stay dormant for long periods, timeout is well-tuned. + + +## Monitoring & Observability + +### Event-Based Monitoring + +Track idle process management through Backend events: + +**Key Metrics**: +- **mcp.server.dormant**: Count of processes entering dormant state +- **mcp.server.respawned**: Count of successful respawns +- **idle_duration_seconds**: Time process was inactive before termination +- **dormant_duration_seconds**: Time process spent dormant before respawn + +### Log Analysis + +Important log operations to monitor: + +**Idle Detection**: +``` +[DEBUG] Skipping process in grace period: server-name (age: 25s) +[DEBUG] Skipping process with active requests: server-name +[INFO] Idle check: terminated 2 idle process(es) +``` + +**Dormant Marking**: +``` +[INFO] Marking process as dormant due to inactivity: server-name +[INFO] Process marked as dormant and terminated: server-name +``` + +**Respawning**: +``` +[INFO] Respawning dormant process: server-name +[INFO] Dormant process respawned successfully: server-name +``` + +### Heartbeat Data + +Satellite heartbeat includes dormant process information: + +**Reported Metrics**: +- Total active process count +- Total dormant process count +- Processes by status (running, starting, terminating, failed) + +## Error Handling + +### Respawn Failures + +If respawn fails, standard error handling applies: + +**Failure Scenarios**: +- Process spawn error +- Handshake timeout +- Invalid configuration + +**Error Flow**: +1. Respawn attempt logs error +2. Dormant config remains in map +3. Next request triggers another respawn attempt +4. 
Auto-restart logic applies (3 attempts max) + +### Termination Failures + +Idle termination handles edge cases: + +**Process Not Found**: +- Logs warning (non-critical) +- Process already terminated externally +- Cleanup continues normally + +**Graceful Shutdown Timeout**: +- SIGTERM sent first (10-second wait) +- SIGKILL sent if timeout exceeded +- Force termination ensures cleanup + +## Development Testing + +### Manual Idle Testing + +Test idle process management locally: + +**Force Idle Termination**: +```bash +# Set short idle timeout for testing +export MCP_PROCESS_IDLE_TIMEOUT_SECONDS=30 + +# Start satellite +npm run dev + +# Spawn process via Backend command +# Wait 30 seconds without activity +# Process should terminate and become dormant +``` + +**Test Respawning**: +```bash +# After process is dormant, make MCP request +curl -X POST http://localhost:3001/mcp \ + -H "Content-Type: application/json" \ + -d '{"jsonrpc":"2.0","id":"1","method":"tools/list","params":{}}' + +# Process should respawn automatically +# Check logs for respawn confirmation +``` + +### Grace Period Testing + +Verify grace period protection: + +```bash +# Set very short idle timeout and standard grace period +export MCP_PROCESS_IDLE_TIMEOUT_SECONDS=10 +export MCP_PROCESS_SPAWN_GRACE_PERIOD_SECONDS=60 + +# Spawn process +# Process should NOT be terminated within first 60 seconds +# Even if idle timeout is only 10 seconds +``` + +## Production Considerations + +### Memory Planning + +Calculate expected memory usage: + +**Formula**: +``` +Total Memory = (Active Processes × 15MB) + (Dormant Processes × 2KB) +``` + +**Example**: +- 200 total MCP servers +- 20 actively used (20 × 15MB = 300MB) +- 180 dormant (180 × 2KB = 360KB) +- Total: ~300MB vs ~3GB without idle management + +### Monitoring Alerts + +Configure alerts for idle process health: + +**Alert Conditions**: +- High respawn rate (>10 per minute): Timeout too aggressive +- High dormant count (>80% of total): Consider longer timeout +- Respawn failures: Configuration or resource issues +- Grace period violations: System overload or timing bugs + +### Resource Limits + +Coordinate with nsjail resource limits: + +**Production Limits**: +- Per-process memory: 50MB (nsjail limit) +- System memory: Plan for peak active processes +- Dormant processes: Negligible memory impact + +## Related Documentation + +- [Process Management](/development/satellite/process-management) - Core stdio subprocess management +- [Background Jobs](/development/satellite/background-jobs) - Job system architecture +- [Event System](/development/satellite/event-system) - Event emission and tracking +- [Tool Discovery](/development/satellite/tool-discovery) - How tool caching interacts with dormancy diff --git a/development/satellite/process-management.mdx b/development/satellite/process-management.mdx index c495c82..757d632 100644 --- a/development/satellite/process-management.mdx +++ b/development/satellite/process-management.mdx @@ -113,6 +113,10 @@ All communication uses newline-delimited JSON following JSON-RPC 2.0 specificati ## Process Lifecycle + +**Idle Process Management**: Processes that remain inactive for extended periods are automatically terminated and respawned on-demand to optimize memory usage. See [Idle Process Management](/development/satellite/idle-process-management) for details on automatic termination, dormant state tracking, and respawning. 
+ + ### Lifecycle States **starting:** @@ -140,16 +144,159 @@ All communication uses newline-delimited JSON following JSON-RPC 2.0 specificati ### Graceful Termination -Termination follows a two-phase approach: +Process termination follows a two-phase graceful shutdown approach to ensure clean process exit and proper resource cleanup. + +#### Termination Phases + +**Phase 1: SIGTERM (Graceful Shutdown)** +- Send SIGTERM signal to the process +- Process has 10 seconds (default timeout) to shut down gracefully +- Process can complete in-flight operations and cleanup resources +- Wait for process to exit voluntarily + +**Phase 2: SIGKILL (Force Termination)** +- If process doesn't exit within timeout period +- Send SIGKILL signal to force immediate termination +- Guaranteed process termination (cannot be caught or ignored) +- Used as last resort for unresponsive processes + +#### Termination Types + +The system handles three types of intentional terminations differently: + +**1. Manual Termination** +- Triggered by explicit restart or stop commands +- Status set to `'terminating'` before sending signals +- No auto-restart triggered +- Standard graceful shutdown with SIGTERM → SIGKILL + +**2. Idle/Dormant Termination** +- Triggered by idle timeout (default: 180 seconds of inactivity) +- Process marked with `isDormantShutdown` flag +- Configuration stored in dormant map for fast respawn +- Tools remain cached for instant availability +- No auto-restart triggered (intentional shutdown) +- See [Idle Process Management](/development/satellite/idle-process-management) for details + +**3. Uninstall Termination** +- Triggered when server removed from configuration +- Process marked with `isUninstallShutdown` flag +- Complete cleanup: process, dormant config, tools, restart tracking +- No auto-restart triggered (intentional removal) +- Invoked via `removeServerCompletely()` method + +#### Crash Detection vs Intentional Shutdown + +The system distinguishes between crashes and intentional shutdowns: + +**Crash Detection Logic:** +```typescript +// Process is considered crashed if: +// 1. Exit code is non-zero (e.g., 1, 143) +// 2. Status is NOT 'terminating' +// 3. NOT marked as intentional shutdown (isDormantShutdown or isUninstallShutdown) +const wasCrash = code !== 0 && code !== null && + processInfo.status !== 'terminating' && + !processInfo.isDormantShutdown && + !processInfo.isUninstallShutdown; +``` + +**Why This Matters:** +- SIGTERM exit code is 143 (non-zero) +- Without flags, graceful termination would trigger auto-restart +- Flags prevent unwanted restarts for intentional shutdowns + +#### Cleanup Operations + +During termination, the following cleanup operations occur: + +1. **Active Request Cancellation** + - All pending JSON-RPC requests are rejected + - Active requests map is cleared + - Clients receive termination error + +2. **State Cleanup** + - Remove from processes map (by process ID) + - Remove from processIdsByName map (by installation name) + - Remove from team tracking sets + - Clear dormant config if exists (for uninstall) + +3. **Resource Tracking** + - Restart attempts cleared (for uninstall) + - Respawn promises cleared + - Process metrics finalized + +4. 
**Event Emission** + - Emit `processTerminated` internal event + - Emit `processExit` with exit code and signal + - Emit `mcp.server.crashed` if crash detected (Backend event) + +#### Complete Server Removal + +The `removeServerCompletely()` method provides comprehensive cleanup for server uninstall: + +**Method Signature:** +```typescript +async removeServerCompletely( + installationName: string, + timeout: number = 10000 +): Promise<{ active: boolean; dormant: boolean }> +``` + +**Operation Flow:** +1. Check for active process + - If found: Set `isUninstallShutdown` flag + - Terminate with graceful shutdown + - Return `active: true` + +2. Check for dormant config + - If found: Remove from dormant map + - Return `dormant: true` + +3. Clear restart tracking + - Delete restart attempts history + - Prevent any future restart attempts + +**Usage Example:** +```typescript +// Called when server removed from configuration +const result = await processManager.removeServerCompletely( + 'sequential-thinking-team-name-abc123' +); + +// Result: { active: true, dormant: false } +// - Active process was terminated +// - No dormant config existed +``` + +**Logging Output:** +``` +INFO: Removing server completely: sequential-thinking-team-name-abc123 +INFO: Terminating active process: sequential-thinking-team-name-abc123 +DEBUG: Sent SIGTERM to sequential-thinking-team-name-abc123 +INFO: Process terminated for uninstall (not a crash) +INFO: Server removed completely (active: true, dormant: false) +``` + +#### Termination Timing + +**Normal Termination:** +- SIGTERM sent: ~1ms +- Process cleanup: 10-500ms (application-dependent) +- Total time: 11-501ms -1. **SIGTERM Phase**: Send graceful shutdown signal -2. **SIGKILL Phase**: Force kill if timeout exceeded (default 10s) +**Forced Termination:** +- SIGTERM sent: ~1ms +- Timeout wait: 10,000ms +- SIGKILL sent: ~1ms +- Immediate kill: ~10ms +- Total time: ~10,012ms -**Cleanup Operations:** -- Cancel all active requests with rejection -- Clear active requests map -- Remove from tracking maps (by ID, by name, by team) -- Emit 'processTerminated' event +**Best Practices:** +- MCP servers should handle SIGTERM gracefully +- Complete in-flight requests within timeout +- Close file handles and network connections +- Exit with code 0 for clean shutdown ## Auto-Restart System @@ -371,6 +518,7 @@ LOG_LEVEL=debug npm run dev ## Related Documentation - [Satellite Architecture Design](/development/satellite/architecture) - Overall system architecture +- [Idle Process Management](/development/satellite/idle-process-management) - Automatic termination and respawning of idle processes - [Tool Discovery Implementation](/development/satellite/tool-discovery) - How tools are discovered from processes - [Team Isolation Implementation](/development/satellite/team-isolation) - Team-based access control - [Backend Communication](/development/satellite/backend-communication) - Integration with Backend commands diff --git a/development/satellite/registration.mdx b/development/satellite/registration.mdx index 9498af7..887d0f3 100644 --- a/development/satellite/registration.mdx +++ b/development/satellite/registration.mdx @@ -439,10 +439,10 @@ open http://localhost:3001/documentation - ✅ All satellites register as `inactive` requiring admin activation **Planned Features:** -- ✅ JWT-based registration token system (Phase 3 complete) +- ✅ JWT-based registration token system - ✅ Bearer token authentication for Backend communication -- 🚧 Admin interface for registration token 
management (Phase 4) -- 🚧 Satellite client registration token support (Phase 5) +- ✅ Admin interface for registration token management +- ✅ Satellite client registration token support - 🚧 API key rotation and renewal - 🚧 Registration status monitoring and alerts diff --git a/development/satellite/tool-discovery.mdx b/development/satellite/tool-discovery.mdx index 20f9c0e..e109090 100644 --- a/development/satellite/tool-discovery.mdx +++ b/development/satellite/tool-discovery.mdx @@ -60,7 +60,7 @@ Tool discovery operates through three coordinated managers that handle different **StdioToolDiscoveryManager:** - Discovers tools from stdio subprocess MCP servers - Executes discovery after process spawn and handshake -- Automatically clears tools on process termination +- **Tools persist in cache even when processes go dormant** - Tracks tools by server with namespacing ## Discovery Process by Transport Type @@ -97,7 +97,7 @@ Process Spawn → Handshake → Running → Discover Tools → Cache → Auto-Cl 2. MCP handshake completes (initialize + initialized) 3. Discovery triggered automatically after handshake 4. Tools cached with namespacing (`server_slug-tool_name`) -5. Tools cleared automatically on process termination +5. **Tools persist in cache** even when process terminates (for fast respawn) ### Discovery Timing Differences @@ -109,7 +109,7 @@ Process Spawn → Handshake → Running → Discover Tools → Cache → Auto-Cl **stdio (Lazy):** - Discovered after process spawn completes - Tools become available post-handshake -- Process termination removes tools automatically +- **Tools persist even when process goes dormant** (enables fast respawn) ## Tool Caching Strategy @@ -132,7 +132,7 @@ interface UnifiedCachedTool { **Cache Characteristics:** - **Unified Namespace**: Same format across both transport types - **Memory Storage**: No persistent storage or database -- **Automatic Cleanup**: stdio tools removed on process exit +- **Persistent Caching**: stdio tools remain cached even when processes go dormant - **Conflict Prevention**: server_slug ensures unique names ### Namespacing Strategy @@ -217,12 +217,18 @@ Both managers implement graceful failure handling: ### Automatic Cleanup -stdio tools are automatically managed: +stdio tools persist in cache for optimal performance: **Process Lifecycle:** - **Spawn**: Tools discovered after handshake - **Running**: Tools available for execution -- **Terminate**: Tools removed from cache automatically +- **Idle/Dormant**: Process terminated, **tools remain cached** for fast respawn +- **Respawn**: Process restarts automatically, tools already available (no rediscovery) +- **Uninstall**: Tools cleared only when server is explicitly removed + + +**Idle Process Management**: stdio processes that remain inactive for the configured idle timeout (default: 3 minutes) are automatically terminated to save memory. However, **tools remain cached** so when a client requests them, the process respawns instantly without needing to rediscover tools. This reduces respawn time from 1-3 seconds to 1-2 seconds. See [Idle Process Management](/development/satellite/idle-process-management) for details. 
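To show how the persistent tool cache and dormant-state tracking interact on a request, here is a purely hypothetical sketch of a tool-call path. Apart from `getDormantConfig`, which is documented in [Idle Process Management](/development/satellite/idle-process-management), every interface and method name below is an assumption for illustration and not the satellite's actual API.

```typescript
// Hypothetical sketch only: these interfaces model the documented behavior, not the real classes.
interface CachedTool { installationName: string; originalName: string; }

interface ProcessManagerLike {
  isRunning(installationName: string): boolean;
  respawnProcess(config: unknown): Promise<void>;
  sendRequest(installationName: string, method: string, params: unknown): Promise<unknown>;
}

interface RuntimeStateLike {
  getDormantConfig(installationName: string): unknown | undefined;
}

async function callCachedTool(
  namespacedName: string,
  args: unknown,
  toolCache: Map<string, CachedTool>,
  processManager: ProcessManagerLike,
  runtimeState: RuntimeStateLike
): Promise<unknown> {
  // 1. Tool metadata survives dormancy, so this lookup succeeds even if the process is gone
  const cached = toolCache.get(namespacedName); // key format: 'server_slug-tool_name'
  if (!cached) {
    throw new Error(`Unknown tool: ${namespacedName}`);
  }

  // 2. Respawn the stdio process if it was terminated for being idle
  if (!processManager.isRunning(cached.installationName)) {
    const dormantConfig = runtimeState.getDormantConfig(cached.installationName);
    if (dormantConfig) {
      await processManager.respawnProcess(dormantConfig); // tools already cached, no rediscovery
    }
  }

  // 3. Forward the call; no tools/list round-trip is required after respawn
  return processManager.sendRequest(cached.installationName, 'tools/call', {
    name: cached.originalName,
    arguments: args
  });
}
```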
+ ## Development Considerations @@ -275,10 +281,10 @@ curl -X POST http://localhost:3001/mcp \ ### stdio Performance -- **Discovery Time**: 1-2 seconds post-spawn -- **Memory**: ~1KB per tool -- **Overhead**: Single JSON-RPC request per process -- **Caching**: Automatic cleanup on process exit +- **Discovery Time**: 1-2 seconds post-spawn (first time only) +- **Memory**: ~1KB per tool (persists even when process dormant) +- **Overhead**: Single JSON-RPC request per process (cached for respawns) +- **Caching**: Persistent - tools remain even when process goes dormant ### Scalability diff --git a/docs.json b/docs.json index 6086916..3c42616 100644 --- a/docs.json +++ b/docs.json @@ -124,7 +124,9 @@ "/development/backend/global-settings", "/development/backend/job-queue", "/development/backend/mail", - "/development/backend/cloud-credentials" + "/development/backend/cloud-credentials", + "/development/backend/cron", + "/development/backend/metrics" ] }, { @@ -185,7 +187,8 @@ "/development/satellite/mcp-transport", "/development/satellite/process-management", "/development/satellite/team-isolation", - "/development/satellite/tool-discovery" + "/development/satellite/tool-discovery", + "/development/satellite/idle-process-management" ] }, { diff --git a/self-hosted/docker-compose.mdx b/self-hosted/docker-compose.mdx index 3935ffb..169cc6d 100644 --- a/self-hosted/docker-compose.mdx +++ b/self-hosted/docker-compose.mdx @@ -27,6 +27,37 @@ This guide provides step-by-step instructions to install and configure DeploySta - **Docker & Docker Compose**: Make sure both are installed and up-to-date. - **Storage**: At least 2GB of available disk space for images and persistent data. +### Linux Host Requirements + + +**For Linux deployments only**: To enable nsjail process isolation in the satellite service, your Linux host must have unprivileged user namespaces enabled. + + +Check if already enabled: + +```bash +cat /proc/sys/kernel/unprivileged_userns_clone +# Should return: 1 +``` + +If not enabled (returns 0 or file doesn't exist), enable it: + +```bash +echo 'kernel.unprivileged_userns_clone=1' | sudo tee /etc/sysctl.d/00-userns.conf +sudo sysctl -p /etc/sysctl.d/00-userns.conf +``` + +Verify the setting: + +```bash +cat /proc/sys/kernel/unprivileged_userns_clone +# Should now return: 1 +``` + + +**Note**: This setting is only required for production Linux deployments. Development on macOS/Windows doesn't need this configuration. + + ## Beggining the setup for Docker Compose Follow these steps for a setup with docker compsoe @@ -157,6 +188,20 @@ The satellite service is **already included** in the docker-compose.yml file. Yo **Note**: After initial registration, the satellite saves its API key to persistent storage and doesn't need the registration token for subsequent starts. + +**Alternative for systems without host-level sysctl configuration**: If you cannot modify the host system's kernel parameters, you can add the `--sysctl` flag to the satellite service in your `docker-compose.yml`: + +```yaml +services: + satellite: + # ... other configuration ... + sysctls: + - kernel.unprivileged_userns_clone=1 +``` + +Note that this requires Docker to run with additional privileges and is less secure than configuring the host system directly. 
+ + ## Configuration ### External Access diff --git a/self-hosted/quick-start.mdx b/self-hosted/quick-start.mdx index 8ea7100..6d94070 100644 --- a/self-hosted/quick-start.mdx +++ b/self-hosted/quick-start.mdx @@ -17,6 +17,37 @@ Get DeployStack up and running in minutes. This guide covers deploying the core - **Docker Compose**: [Install Docker Compose](https://docs.docker.com/compose/install/) - **System Requirements**: 4GB RAM, 20GB disk space +### Linux Host Requirements + + +**For Linux deployments only**: To enable nsjail process isolation in the satellite service, your Linux host must have unprivileged user namespaces enabled. + + +Check if already enabled: + +```bash +cat /proc/sys/kernel/unprivileged_userns_clone +# Should return: 1 +``` + +If not enabled (returns 0 or file doesn't exist), enable it: + +```bash +echo 'kernel.unprivileged_userns_clone=1' | sudo tee /etc/sysctl.d/00-userns.conf +sudo sysctl -p /etc/sysctl.d/00-userns.conf +``` + +Verify the setting: + +```bash +cat /proc/sys/kernel/unprivileged_userns_clone +# Should now return: 1 +``` + + +**Note**: This setting is only required for production Linux deployments. Development on macOS/Windows doesn't need this configuration. + + ## Method 1: Docker Compose (Recommended) The fastest way to get DeployStack running with proper networking and persistence. @@ -198,16 +229,42 @@ After completing the basic backend and frontend setup, deploy at least one satel - ```bash - docker run -d \ - --name deploystack-satellite \ - -p 3001:3001 \ - -e DEPLOYSTACK_BACKEND_URL="http://localhost:3000" \ - -e DEPLOYSTACK_SATELLITE_NAME="my-satellite-001" \ - -e DEPLOYSTACK_REGISTRATION_TOKEN="your-token-here" \ - -v deploystack_satellite_persistent:/app/persistent_data \ - deploystack/satellite:latest - ``` + + + If you configured the host system with unprivileged user namespaces (recommended): + + ```bash + docker run -d \ + --name deploystack-satellite \ + -p 3001:3001 \ + -e DEPLOYSTACK_BACKEND_URL="http://localhost:3000" \ + -e DEPLOYSTACK_SATELLITE_NAME="my-satellite-001" \ + -e DEPLOYSTACK_REGISTRATION_TOKEN="your-token-here" \ + -v deploystack_satellite_persistent:/app/persistent_data \ + deploystack/satellite:latest + ``` + + + + If you cannot modify the host system, pass the sysctl setting at runtime: + + ```bash + docker run -d \ + --name deploystack-satellite \ + --sysctl kernel.unprivileged_userns_clone=1 \ + -p 3001:3001 \ + -e DEPLOYSTACK_BACKEND_URL="http://localhost:3000" \ + -e DEPLOYSTACK_SATELLITE_NAME="my-satellite-001" \ + -e DEPLOYSTACK_REGISTRATION_TOKEN="your-token-here" \ + -v deploystack_satellite_persistent:/app/persistent_data \ + deploystack/satellite:latest + ``` + + + The `--sysctl` flag requires Docker to be run with additional privileges. For production deployments, configuring the host system is preferred. + + + **Satellite Name Requirements:**