Skip to content

Commit 4eda469

Browse files
fix: smart configuration path resolution for response-filter.json
- Automatically locate response-filter.json in the same directory as the server config. - Fix 'Response filter configuration not found' errors. - Add a global DATAPROC_CONFIG_DIR for consistent path resolution. - Enhance logging for configuration path debugging. - Maintain backward compatibility with a fallback to process.cwd(). Resolves configuration path issues when the MCP server runs from different directories.
1 parent 342a283 commit 4eda469

File tree

9 files changed

+337
-3
lines changed

9 files changed

+337
-3
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,3 +112,4 @@ SERVICE_ACCOUNT_AUTHENTICATION_GUIDE.md
112112
examples/web-apps/*
113113

114114
state/*
115+
scripts/copy-config-to-desktop.sh

config/response-filter.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,10 @@
4949
"url": "http://localhost:6334",
5050
"collectionName": "dataproc_knowledge",
5151
"vectorSize": 384,
52-
"distance": "Cosine"
52+
"distance": "Cosine",
53+
"timeout": 30000,
54+
"retryAttempts": 3,
55+
"healthCheckInterval": 60000
5356
},
5457
"formatting": {
5558
"useEmojis": true,
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
{
2+
"profileManager": {
3+
"rootConfigPath": "/Users/srivers/Documents/Cline/MCP/dataproc-server/profiles",
4+
"profileScanInterval": 300000
5+
},
6+
"clusterTracker": {
7+
"stateFilePath": "./state/dataproc-state.json",
8+
"stateSaveInterval": 60000
9+
},
10+
"authentication": {
11+
"impersonateServiceAccount": "grpn-sa-terraform-data-science@prj-grp-central-sa-prod-0b25.iam.gserviceaccount.com",
12+
"preferImpersonation": true,
13+
"useApplicationDefaultFallback": false
14+
},
15+
"defaultParameters": {
16+
"defaultEnvironment": "production",
17+
"parameters": [],
18+
"environments": [
19+
{
20+
"environment": "production",
21+
"parameters": {
22+
"machineType": "n1-standard-8",
23+
"numWorkers": 4,
24+
"projectId": "prj-grp-data-sci-prod-b425",
25+
"region": "us-central1"
26+
}
27+
}
28+
]
29+
},
30+
"semanticSearch": {
31+
"enabled": true,
32+
"qdrant": {
33+
"url": "http://localhost:6334",
34+
"collectionName": "dataproc_knowledge",
35+
"vectorSize": 384,
36+
"distance": "Cosine",
37+
"timeout": 30000,
38+
"retryAttempts": 3,
39+
"healthCheckInterval": 60000
40+
},
41+
"knowledgeBase": {
42+
"autoIndexing": true,
43+
"indexingInterval": 300000,
44+
"maxDocuments": 10000,
45+
"confidenceThreshold": 0.7
46+
}
47+
},
48+
"responseOptimization": {
49+
"enabled": true,
50+
"tokenLimits": {
51+
"list_clusters": 500,
52+
"get_cluster": 300,
53+
"default": 400
54+
},
55+
"caching": {
56+
"enabled": true,
57+
"ttlSeconds": 300,
58+
"maxCacheSize": 100
59+
}
60+
},
61+
"logging": {
62+
"level": "info",
63+
"enableConsole": true,
64+
"enableFile": false,
65+
"semanticSearchLogs": true
66+
}
67+
}

docker-compose.qdrant.yml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
version: '3.8'
2+
3+
services:
4+
qdrant:
5+
image: qdrant/qdrant:latest
6+
container_name: dataproc-qdrant
7+
ports:
8+
- "6334:6333" # HTTP API
9+
- "6335:6334" # gRPC API (optional)
10+
volumes:
11+
- qdrant_storage:/qdrant/storage
12+
environment:
13+
- QDRANT__SERVICE__HTTP_PORT=6333
14+
- QDRANT__SERVICE__GRPC_PORT=6334
15+
- QDRANT__LOG_LEVEL=INFO
16+
restart: unless-stopped
17+
healthcheck:
18+
test: ["CMD", "curl", "-f", "http://localhost:6333/health"]
19+
interval: 30s
20+
timeout: 10s
21+
retries: 3
22+
start_period: 40s
23+
24+
volumes:
25+
qdrant_storage:
26+
driver: local

example_dataproc_opp.yml

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
customModes:
2+
- slug: dataproc-ops
3+
name: 🔧 DataprocOps
4+
roleDefinition: >-
5+
You are Roo, a Dataproc operations specialist with enhanced MCP capabilities, intelligent parameter management, and advanced semantic search. Your expertise includes:
6+
- Managing Google Cloud Dataproc clusters with smart default parameters
7+
- Executing and monitoring Hive/Spark jobs with minimal parameter requirements
8+
- Leveraging MCP resources for configuration access (dataproc://config/defaults, dataproc://profile/*)
9+
- Using memory tools to store and retrieve operational insights
10+
- Optimizing cluster and job configurations based on historical usage
11+
- Utilizing the enhanced Dataproc MCP server with 60-80% reduced parameter requirements
12+
- Performing semantic searches with natural language queries (e.g., "clusters with machine learning packages")
13+
- Extracting intelligent insights from cluster configurations using vector embeddings
14+
- Providing graceful degradation when optional semantic features are unavailable
15+
whenToUse: >-
16+
Use this mode when working with Google Cloud Dataproc operations, including:
17+
- Creating or managing Dataproc clusters (now requires minimal parameters)
18+
- Submitting and monitoring Hive/Spark jobs (simplified with smart defaults)
19+
- Managing cluster profiles and configurations via MCP resources
20+
- Analyzing job performance and cluster utilization
21+
- Leveraging the enhanced MCP server with intelligent default parameter injection
22+
- Accessing cluster configurations through dataproc:// resource URIs
23+
- Performing semantic searches for cluster discovery and analysis
24+
- Extracting insights from configurations using natural language queries
25+
- Analyzing infrastructure patterns and optimization opportunities
26+
groups:
27+
- read
28+
- - edit
29+
- fileRegex: \.(yaml|json|sql|hql)$
30+
description: sql/hql query files and YAML and JSON configuration files
31+
- mcp
32+
- command
33+
customInstructions: |-
34+
ENHANCED WORKFLOW (Updated for Smart Defaults, Resources & Semantic Search):
35+
36+
1. **Smart Parameter Management**:
37+
- Leverage default parameter injection (projectId/region auto-filled)
38+
- Use minimal parameters for tool calls (e.g., get_job_status with just jobId)
39+
- Access default configuration via 'dataproc://config/defaults' resource
40+
- Store custom parameters in memory only when they differ from defaults
41+
42+
2. **Resource-Enhanced Operations**:
43+
- Use 'dataproc://profile/{id}' resources to access cluster profiles
44+
- Leverage 'dataproc://config/defaults' for current environment settings
45+
- Access tracked clusters via dataproc:// resource URIs
46+
- Store resource URIs in memory for quick access
47+
48+
3. **Simplified Cluster Operations**:
49+
- Use 'start_dataproc_cluster' with just clusterName (defaults auto-inject)
50+
- Use 'list_clusters' with no parameters (uses configured defaults)
51+
- Apply profile-based configurations via 'create_cluster_from_profile'
52+
- Monitor cluster health with simplified parameter sets
53+
54+
4. **Streamlined Job Execution**:
55+
- Use 'get_job_status' with only jobId (projectId/region from defaults)
56+
- Submit jobs with minimal required parameters
57+
- Track job performance with simplified monitoring calls
58+
- Store successful job patterns with reduced parameter sets
59+
60+
5. **Enhanced Configuration Management**:
61+
- Access profiles via MCP resources instead of file system
62+
- Update default-params.json for environment-specific settings
63+
- Version control configurations with smart parameter awareness
64+
- Maintain profile templates accessible via dataproc:// URIs
65+
66+
6. **🧠 Semantic Search & Knowledge Base**:
67+
- Use 'query_cluster_data' for natural language infrastructure queries
68+
- Add semanticQuery parameter to 'list_clusters' and 'get_cluster' for intelligent filtering
69+
- Query with natural language: "clusters with machine learning packages", "high-memory configurations"
70+
- Store semantic insights in memory for pattern recognition and optimization
71+
- Leverage confidence scoring to prioritize relevant results
72+
- Use 'query_knowledge' for comprehensive knowledge base searches across clusters, jobs, and errors
73+
74+
7. **🎯 Intelligent Data Extraction**:
75+
- Extract meaningful insights from cluster configurations automatically
76+
- Identify patterns in machine types, pip packages, network configurations
77+
- Analyze component installations and optimization opportunities
78+
- Store extracted knowledge for future reference and comparison
79+
- Use vector embeddings for semantic similarity matching
80+
81+
8. **🔄 Graceful Degradation Handling**:
82+
- Provide helpful setup guidance when Qdrant is unavailable
83+
- Maintain full functionality with standard queries when semantic search is offline
84+
- Guide users through semantic search setup: "docker run -p 6334:6333 qdrant/qdrant"
85+
- Explain benefits of semantic search while providing standard alternatives
86+
87+
ALWAYS:
88+
- **Leverage smart defaults**: Use minimal parameters, let server inject defaults
89+
- **Access MCP resources**: Use dataproc:// URIs for configuration access
90+
- **Store operational insights**: Use memory for patterns, not basic parameters
91+
- **Optimize with defaults**: Configure default-params.json for your environment
92+
- **Maintain audit trail**: Track operations with simplified parameter logging
93+
- **Test resource access**: Verify dataproc:// resources are available before operations
94+
- **🧠 Use semantic search**: Leverage natural language queries for intelligent data discovery
95+
- **📊 Extract insights**: Store meaningful patterns and configurations in memory
96+
- **🎯 Provide guidance**: Help users understand semantic search benefits and setup
97+
98+
KEY ENHANCEMENTS:
99+
- 60-80% fewer parameters required for most operations
100+
- Direct access to configurations via MCP resources
101+
- Environment-independent authentication with service account impersonation
102+
- 53-58% faster operations with authentication caching
103+
- 🧠 **Natural language cluster discovery** with semantic search
104+
- 🎯 **Intelligent data extraction** from configurations and responses
105+
- 📊 **Confidence-scored results** for better decision making
106+
- 🔄 **Graceful degradation** maintaining functionality without dependencies
107+
108+
SEMANTIC SEARCH EXAMPLES:
109+
```javascript
110+
// Natural language cluster discovery
111+
query_cluster_data: { "query": "pip packages for data science" }
112+
list_clusters: { "semanticQuery": "high-memory instances with SSD" }
113+
get_cluster: { "clusterName": "ml-cluster", "semanticQuery": "Python libraries and packages" }
114+
115+
// Knowledge base queries
116+
query_knowledge: { "query": "machine learning configurations", "type": "clusters" }
117+
query_knowledge: { "query": "failed jobs with memory errors", "type": "errors" }
118+
```
119+
120+
SETUP GUIDANCE FOR SEMANTIC FEATURES:
121+
1. 🐳 Start Qdrant: `docker run -p 6334:6333 qdrant/qdrant`
122+
2. ✅ Verify: `curl http://localhost:6334/health`
123+
3. 🔄 Restart MCP server to enable semantic features
124+
4. 📖 Full docs: docs/KNOWLEDGE_BASE_SEMANTIC_SEARCH.md
125+
126+
127+
128+

scripts/setup-qdrant.sh

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#!/bin/bash
2+
3+
# Qdrant Setup Script for Dataproc MCP Server
4+
# This script sets up Qdrant vector database for semantic search
5+
6+
set -e
7+
8+
echo "🐳 Setting up Qdrant for Dataproc MCP Server..."
9+
10+
# Check if Docker is running
11+
if ! docker info > /dev/null 2>&1; then
12+
echo "❌ Docker is not running. Please start Docker first."
13+
exit 1
14+
fi
15+
16+
# Stop existing Qdrant containers
17+
echo "🛑 Stopping existing Qdrant containers..."
18+
docker stop dataproc-qdrant 2>/dev/null || true
19+
docker rm dataproc-qdrant 2>/dev/null || true
20+
21+
# Start Qdrant using Docker Compose
22+
echo "🚀 Starting Qdrant with Docker Compose..."
23+
docker-compose -f docker-compose.qdrant.yml up -d
24+
25+
# Wait for Qdrant to be ready
26+
echo "⏳ Waiting for Qdrant to be ready..."
27+
for i in {1..30}; do
28+
if curl -s http://localhost:6334/health > /dev/null 2>&1; then
29+
echo "✅ Qdrant is ready!"
30+
break
31+
fi
32+
if [ $i -eq 30 ]; then
33+
echo "❌ Qdrant failed to start after 30 seconds"
34+
docker-compose -f docker-compose.qdrant.yml logs
35+
exit 1
36+
fi
37+
sleep 1
38+
done
39+
40+
# Verify Qdrant health
41+
echo "🔍 Verifying Qdrant health..."
42+
HEALTH_RESPONSE=$(curl -s http://localhost:6334/health)
43+
echo "Health check response: $HEALTH_RESPONSE"
44+
45+
# Check collections
46+
echo "📊 Checking existing collections..."
47+
curl -s http://localhost:6334/collections | jq '.' || echo "No collections yet (this is normal)"
48+
49+
echo ""
50+
echo "🎉 Qdrant setup complete!"
51+
echo ""
52+
echo "📋 Configuration Summary:"
53+
echo " • Qdrant URL: http://localhost:6334"
54+
echo " • Container: dataproc-qdrant"
55+
echo " • Storage: Persistent volume (qdrant_storage)"
56+
echo " • Health endpoint: http://localhost:6334/health"
57+
echo ""
58+
echo "🔧 Next Steps:"
59+
echo " 1. Restart your MCP server to enable semantic search"
60+
echo " 2. Test with: query_cluster_data or list_clusters with semanticQuery"
61+
echo " 3. Check logs: docker-compose -f docker-compose.qdrant.yml logs"
62+
echo ""
63+
echo "🛑 To stop: docker-compose -f docker-compose.qdrant.yml down"
64+
echo "🗑️ To reset: docker-compose -f docker-compose.qdrant.yml down -v"

src/config/server.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@ import * as path from 'path';
77
import { fileURLToPath } from 'url';
88
import { ProfileManagerConfig, ClusterTrackerConfig } from '../types/profile.js';
99

10+
// Global type declaration for config directory
11+
declare global {
12+
// eslint-disable-next-line no-var
13+
var DATAPROC_CONFIG_DIR: string;
14+
}
15+
1016
// Determine the application root directory
1117
const __filename = fileURLToPath(import.meta.url);
1218
const __dirname = path.dirname(__filename);
@@ -124,6 +130,12 @@ export async function getServerConfig(configPath?: string): Promise<ServerConfig
124130
console.error(`[DIAGNOSTIC] Server Config: Current working directory: ${process.cwd()}`);
125131
console.error(`[DIAGNOSTIC] Server Config: Absolute config path: ${filePath}`);
126132

133+
// Store the config directory for other modules to use
134+
// eslint-disable-next-line no-undef
135+
global.DATAPROC_CONFIG_DIR = path.dirname(filePath);
136+
// eslint-disable-next-line no-undef
137+
console.error(`[DIAGNOSTIC] Server Config: Config directory: ${global.DATAPROC_CONFIG_DIR}`);
138+
127139
try {
128140
// Check if the config file exists
129141
try {

src/index.ts

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,23 @@ try {
126126
// Initialize response filter and Qdrant manager (async initialization)
127127
async function initializeResponseOptimization() {
128128
try {
129-
const responseFilterConfigPath = path.join(process.cwd(), 'config', 'response-filter.json');
129+
// Try to use the same directory as the main server config
130+
let responseFilterConfigPath: string;
131+
132+
if (global.DATAPROC_CONFIG_DIR) {
133+
// Use the same directory as server_main.json
134+
responseFilterConfigPath = path.join(global.DATAPROC_CONFIG_DIR, 'response-filter.json');
135+
console.log(
136+
`[INFO] Looking for response-filter.json in server config directory: ${responseFilterConfigPath}`
137+
);
138+
} else {
139+
// Fallback to the old behavior
140+
responseFilterConfigPath = path.join(process.cwd(), 'config', 'response-filter.json');
141+
console.log(
142+
`[INFO] Fallback: Looking for response-filter.json in: ${responseFilterConfigPath}`
143+
);
144+
}
145+
130146
if (fs.existsSync(responseFilterConfigPath)) {
131147
const responseFilterConfig = JSON.parse(fs.readFileSync(responseFilterConfigPath, 'utf8'));
132148

src/services/response-filter.ts

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,13 +124,30 @@ export class ResponseFilter {
124124
*/
125125
private static async loadConfig(): Promise<ResponseFilterConfig> {
126126
try {
127-
const configPath = path.join(process.cwd(), 'config', 'response-filter.json');
127+
// Try to use the same directory as the main server config
128+
let configPath: string;
129+
130+
// eslint-disable-next-line no-undef
131+
if (global.DATAPROC_CONFIG_DIR) {
132+
// Use the same directory as server_main.json
133+
// eslint-disable-next-line no-undef
134+
configPath = path.join(global.DATAPROC_CONFIG_DIR, 'response-filter.json');
135+
console.log(
136+
`[INFO] Looking for response-filter.json in server config directory: ${configPath}`
137+
);
138+
} else {
139+
// Fallback to the old behavior
140+
configPath = path.join(process.cwd(), 'config', 'response-filter.json');
141+
console.log(`[INFO] Fallback: Looking for response-filter.json in: ${configPath}`);
142+
}
143+
128144
const configData = await fs.readFile(configPath, 'utf-8');
129145
const config = JSON.parse(configData) as ResponseFilterConfig;
130146

131147
// Validate required fields
132148
ResponseFilter.validateConfig(config);
133149

150+
console.log(`[SUCCESS] Loaded response filter configuration from: ${configPath}`);
134151
return config;
135152
} catch (error) {
136153
console.warn('Failed to load response filter config, using defaults:', error);

0 commit comments

Comments (0)