From 7d9923651cde9a1c2115f1a687f1d9f021ca613d Mon Sep 17 00:00:00 2001
From: Anuj Sharma <anshrma@amazon.com>
Date: Tue, 11 Nov 2025 12:57:07 -0800
Subject: [PATCH 1/7] Refactor deployment pipeline and add Elasticsearch
 observability

- Move Docker image building from app-template-generator to TemplateStorageStack
- Add Elasticsearch-based observability integration with a new elastic.py module
- Remove CloudFormation deployment service in favor of a unified deployment approach
- Update UI components and configuration models to support new deployment flow
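- Enhance stack definitions with force deployment capabilities
- Skip tool results whose toolUseId matches no earlier tool use in the custom
  Bedrock provider, and replace print/emoji output with plain logger calls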
---
 app-template-generator.py                     | 155 +-----
 .../common/custom_bedrock_provider.py         |  91 ++--
 .../common/memory/elasticsearch.py            | 215 +++-----
 .../common/observability/__init__.py          |  32 +-
 .../common/observability/elastic.py           | 236 ++++++++
 .../configuration-api/app/api/config.py       |  77 ++-
 .../configuration-api/app/api/deployment.py   | 154 +++---
 .../app/models/agent_config.py                |   8 +
 .../app/models/form_schema.py                 |  89 +--
 .../app/models/ssm_data_models.py             |  17 +-
 .../app/services/agent_config_service.py      | 234 +++++++-
 .../cloudformation_deployment_service.py      | 514 ------------------
 .../app/services/deployment_service.py        | 508 +++++++++++++----
 .../app/services/parameter_initialization.py  |   4 +-
 .../app/utils/dependencies.py                 |  24 -
 .../ui-react-cloudscape/package-lock.json     |  21 +
 .../ui-react-cloudscape/package.json          |   2 +
 application_src/ui-react-cloudscape/server.js |  95 +++-
 .../src/components/AgentMapping.js            |  78 ++-
 .../src/components/AgentWizard.js             | 145 +++--
 .../src/components/ChatInterface.js           | 108 ++--
 .../src/services/configuration.js             |  21 +
 stacks/common/base.py                         |  40 +-
 stacks/multi_agent/stack.py                   | 150 ++++-
 stacks/template_storage/stack.py              |  99 +++-
 25 files changed, 1860 insertions(+), 1257 deletions(-)
 create mode 100644 application_src/common/observability/elastic.py
 delete mode 100644 application_src/configuration-api/app/services/cloudformation_deployment_service.py

diff --git a/app-template-generator.py b/app-template-generator.py
index 98dda69..764fe7b 100644
--- a/app-template-generator.py
+++ b/app-template-generator.py
@@ -1,118 +1,31 @@
 #!/usr/bin/env python3
 """
 CDK app for generating agent template without deployment.
-This app synthesizes the MultiAgentStack as a standalone CloudFormation template,
-builds and pushes the Docker image to ECR, and uploads everything to S3.
+This app synthesizes the MultiAgentStack as a standalone CloudFormation template
+and prepares it for upload to S3 during deployment.
+
+The Docker image build is now handled by TemplateStorageStack during deployment,
+following the same pattern as template upload to S3.
 
 This runs as a pre-hook before 'cdk deploy' to ensure the template
-is always up-to-date with the stack definition and available in S3.
+is always up-to-date with the stack definition.
 """
 import os
 import json
 import boto3
 import sys
-import shlex
-import subprocess
-from aws_cdk import App, Fn, Environment, Stack
-from aws_cdk import aws_ec2 as ec2, aws_ecs as ecs, aws_s3 as s3, aws_ecr_assets as ecr_assets
+import logging
+from aws_cdk import App, Fn, Environment
 from stacks.multi_agent.stack import MultiAgentStack
 from helper.config import Config
 
-
-def build_and_push_agent_image(app: App, account_id: str, region: str) -> str:
-    """
-    Build and push the agent Docker image to ECR using direct Docker commands.
-    
-    Args:
-        app: CDK app instance  
-        account_id: AWS account ID
-        region: AWS region
-        
-    Returns:
-        ECR image URI with stable tag
-    """
-    print("\n🐳 Building and pushing agent Docker image to ECR...")
-    
-    import hashlib
-    
-    # Use stable tag based on directory content hash
-    ecr_repo = f"{account_id}.dkr.ecr.{region}.amazonaws.com/cdk-hnb659fds-container-assets-{account_id}-{region}"
-    stable_tag = "agent-instance-latest"
-    image_uri = f"{ecr_repo}:{stable_tag}"
-    
-    # Check if Docker/Podman is available - use safe validated default
-    docker_executable = os.environ.get('CDK_DOCKER', 'docker')
-    
-    # SECURITY: Validate docker executable to prevent command injection
-    allowed_executables = ['docker', 'podman']
-    if docker_executable not in allowed_executables:
-        raise ValueError(f"Invalid container runtime specified: {docker_executable}. Allowed: {allowed_executables}")
-    
-    try:
-        # 1. Login to ECR - separate password retrieval and login for security
-        print(f"  Logging in to ECR...")
-        
-        # Get ECR password securely - validate region parameter
-        if not region or not region.replace('-', '').replace('_', '').isalnum():
-            raise ValueError(f"Invalid AWS region: {region}")
-            
-        password_result = subprocess.run(
-            ['aws', 'ecr', 'get-login-password', '--region', region],
-            check=True,
-            capture_output=True,
-            text=True
-        )
-        
-        # Login to ECR with password from stdin - validate parameters
-        if not account_id or not account_id.isdigit() or len(account_id) != 12:
-            raise ValueError(f"Invalid AWS account ID: {account_id}")
-            
-        ecr_endpoint = f"{account_id}.dkr.ecr.{region}.amazonaws.com"
-            
-        subprocess.run(
-            [docker_executable, 'login', '--username', 'AWS', '--password-stdin', ecr_endpoint],
-            input=password_result.stdout,
-            check=True,
-            capture_output=True,
-            text=True
-        )
-        
-        # 2. Build image - validate parameters
-        print(f"  Building Docker image...")
-        if not image_uri or '://' in image_uri.split(':')[0]:  # Basic URI validation
-            raise ValueError(f"Invalid image URI: {image_uri}")
-            
-        subprocess.run(
-            [docker_executable, 'build',
-             '-t', image_uri,
-             '--platform', 'linux/arm64',
-             '-f', 'application_src/multi-agent/agent-instance/Dockerfile',
-             'application_src'],
-            check=True,
-            capture_output=True
-        )
-        
-        # 3. Push to ECR
-        print(f"  Pushing image to ECR...")
-        subprocess.run(
-            [docker_executable, 'push', image_uri],
-            check=True,
-            capture_output=True
-        )
-        
-        print(f"✓ Docker image built and pushed to ECR: {image_uri}")
-        return image_uri
-        
-    except subprocess.CalledProcessError as e:
-        print(f"⚠️  Warning: Failed to build/push Docker image: {e}")
-        print(f"  Using placeholder - manual push required")
-        return image_uri
-
-
+logger = logging.getLogger(__name__)
 
 
 def main():
     """Generate the template and save to file."""
+    logger.info("Generating agent template...")
+    
     # Get actual AWS account and region
     try:
         sts = boto3.client('sts')
@@ -125,10 +38,6 @@ def main():
     # Create app
     app = App()
     
-    # Step 1: Build and push Docker image to ECR BEFORE creating the stack
-    # This ensures the image exists in ECR before the CloudFormation template references it
-    agent_image_uri = build_and_push_agent_image(app, account_id, region)
-    
     # Get configuration
     environment = app.node.try_get_context("environment") or os.environ.get("ENVIRONMENT", "development")
     conf = Config(environment=environment)
@@ -136,7 +45,7 @@ def main():
     # Get project name from config (should match app.py)
     project_name_from_config = conf.get('ProjectName')
     
-    print(f"\n📦 Using CloudFormation exports from VPC stack...")
+    logger.info("Using CloudFormation exports from VPC stack...")
     
     # Import VPC ID using CloudFormation export token
     # The MultiAgentStack will handle importing the VPC and all related resources
@@ -151,10 +60,10 @@ def main():
     # Import VPC Lattice service network ARN using CloudFormation export
     service_network_arn = Fn.import_value(f"{project_name_from_config}-vpc:ExportsOutputFnGetAttservicenetworkArnD9BDB9C7")
     
-    print(f"✓ All infrastructure references use CloudFormation imports")
-    print(f"  - VPC ID: {{Fn::ImportValue: {project_name_from_config}-VpcId}}")
-    print(f"  - Cluster: {{Fn::ImportValue: {project_name_from_config}-ClusterName}}")
-    print(f"  - Bucket: {{Fn::ImportValue: {project_name_from_config}-AccessLogBucketName}}")
+    logger.info("All infrastructure references use CloudFormation imports")
+    logger.debug(f"  - VPC ID: {{Fn::ImportValue: {project_name_from_config}-VpcId}}")
+    logger.debug(f"  - Cluster: {{Fn::ImportValue: {project_name_from_config}-ClusterName}}")
+    logger.debug(f"  - Bucket: {{Fn::ImportValue: {project_name_from_config}-AccessLogBucketName}}")
     
     # Create the agent stack - MultiAgentStack will import all resources internally
     # Pass tokens directly - no boto3 queries needed!
@@ -183,25 +92,17 @@ def main():
         with open(template_path, 'r', encoding='utf-8') as f:
             template_content = json.load(f)
         
-        # Replace Docker image reference with stable tag
-        task_def = template_content['Resources']['agenttaskdefinitionF56FAA50']
-        container = task_def['Properties']['ContainerDefinitions'][0]
-        
-        # Update image to use stable tag
-        container['Image'] = {
-            'Fn::Sub': f"{account_id}.dkr.ecr.{region}.${{AWS::URLSuffix}}/cdk-hnb659fds-container-assets-{account_id}-{region}:agent-instance-latest"
-        }
-        
-        print(f"✓ Updated template to use stable Docker image tag: agent-instance-latest")
+        # Log that template uses ImageTag parameter (will be updated by TemplateStorageStack)
+        logger.info("Template uses ImageTag parameter (default will be updated during deployment)")
         
         # Save to standard location for easy access
         output_path = "cdk.out/GenericAgentTemplate.json"
         with open(output_path, 'w', encoding='utf-8') as f:
             json.dump(template_content, f, indent=2)
         
-        print(f"✓ Template generated: {output_path}")
-        print(f"  Stack name: {agent_stack.stack_name}")
-        print(f"  Template size: {len(json.dumps(template_content))} bytes")
+        logger.info(f"Template generated: {output_path}")
+        logger.debug(f"  Stack name: {agent_stack.stack_name}")
+        logger.debug(f"  Template size: {len(json.dumps(template_content))} bytes")
         
         # List resources for reference
         if 'Resources' in template_content:
@@ -211,18 +112,18 @@ def main():
                 rtype = resource.get('Type', 'Unknown')
                 resource_types[rtype] = resource_types.get(rtype, 0) + 1
             
-            print(f"  Total resources: {resource_count}")
-            print(f"  Resource breakdown:")
+            logger.debug(f"  Total resources: {resource_count}")
+            logger.debug("  Resource breakdown:")
             for rtype, count in sorted(resource_types.items()):
-                print(f"    - {rtype}: {count}")
+                logger.debug(f"    - {rtype}: {count}")
         
-        # Note: Template will be automatically uploaded to S3 by TemplateStorageStack
-        # using CDK's BucketDeployment construct during 'cdk deploy'
-        print(f"\n📤 Template will be uploaded to S3 by TemplateStorageStack during deployment")
+        logger.info("Template and Docker image will be deployed by TemplateStorageStack")
+        logger.debug("  - Template: Uploaded to S3 during deployment")
+        logger.debug("  - Docker Image: Built and pushed to ECR during deployment")
         
         return True
     else:
-        print(f"Error: Template not found at {template_path}")
+        logger.error(f"Template not found at {template_path}")
         return False
 
 
diff --git a/application_src/common/custom_bedrock_provider.py b/application_src/common/custom_bedrock_provider.py
index 87d858c..da4c497 100644
--- a/application_src/common/custom_bedrock_provider.py
+++ b/application_src/common/custom_bedrock_provider.py
@@ -103,9 +103,9 @@ def _initialize_client(self):
                     tcp_keepalive=True  # Enable TCP keepalive for long connections
                 )
             )
-            logger.info(f"✅ Custom Bedrock client initialized for {self.config['model_id']} in region {self.config['region']}")
+            logger.info(f"Custom Bedrock client initialized for {self.config['model_id']} in region {self.config['region']}")
         except Exception as e:
-            logger.warning(f"❌ Failed to initialize custom Bedrock client: {e}")
+            logger.error(f"Failed to initialize custom Bedrock client: {e}")
             raise
     
     def _is_throttling_error(self, error: Exception) -> bool:
@@ -128,7 +128,7 @@ def _is_throttling_error(self, error: Exception) -> bool:
             # Check error code first (most reliable)
             for code in throttling_codes:
                 if code in error_code:
-                    logger.warning(f"🚨 BEDROCK THROTTLING CODE: {error_code}")
+                    logger.warning(f"Bedrock throttling detected - error code: {error_code}")
                     return True
             
             # Check error message for throttling indicators
@@ -140,7 +140,7 @@ def _is_throttling_error(self, error: Exception) -> bool:
             
             for msg in throttling_messages:
                 if msg in error_message:
-                    logger.warning(f"🚨 BEDROCK THROTTLING MESSAGE: {error_message}")
+                    logger.warning(f"Bedrock throttling detected - message: {error_message}")
                     return True
         
         return False
@@ -167,16 +167,17 @@ def _is_connection_error(self, error: Exception) -> bool:
         
         for pattern in connection_errors:
             if pattern in error_message:
-                logger.warning(f"🔌 CONNECTION ERROR DETECTED: {error_message}")
+                logger.warning(f"Connection error detected: {error_message}")
                 return True
         
         return False
     
     def _format_messages_for_bedrock(self, messages: Messages) -> list[dict[str, Any]]:
-        """Convert Strands Messages to Bedrock converse_stream format."""
+        """Convert Strands Messages to Bedrock converse_stream format with validation."""
         bedrock_messages = []
+        tool_use_ids = set()  # Track tool use IDs to validate results
         
-        for message in messages:
+        for msg_idx, message in enumerate(messages):
             role = message.get('role', 'user')
             content_blocks = message.get('content', [])
             
@@ -190,9 +191,13 @@ def _format_messages_for_bedrock(self, messages: Messages) -> list[dict[str, Any
                 elif isinstance(block, dict) and 'toolUse' in block:
                     # Handle tool use blocks
                     tool_use = block['toolUse']
+                    tool_use_id = tool_use.get('toolUseId')
+                    if tool_use_id:
+                        tool_use_ids.add(tool_use_id)
+                        logger.debug(f"Registered tool use ID: {tool_use_id}")
                     bedrock_content.append({
                         "toolUse": {
-                            "toolUseId": tool_use.get('toolUseId'),
+                            "toolUseId": tool_use_id,
                             "name": tool_use.get('name'),
                             "input": tool_use.get('input', {})
                         }
@@ -200,12 +205,21 @@ def _format_messages_for_bedrock(self, messages: Messages) -> list[dict[str, Any
                 elif isinstance(block, dict) and 'toolResult' in block:
                     # Handle tool result blocks
                     tool_result = block['toolResult']
+                    tool_use_id = tool_result.get('toolUseId')
+                    
+                    # Validate that this tool result matches a previous tool use
+                    if tool_use_id not in tool_use_ids:
+                        logger.warning(f"Tool result ID {tool_use_id} at message {msg_idx} doesn't match any previous tool use. Available IDs: {tool_use_ids}")
+                        # Skip this invalid tool result to prevent ValidationException
+                        continue
+                    
                     bedrock_content.append({
                         "toolResult": {
-                            "toolUseId": tool_result.get('toolUseId'),
+                            "toolUseId": tool_use_id,
                             "content": tool_result.get('content', [])
                         }
                     })
+                    logger.debug(f"Validated tool result ID: {tool_use_id}")
             
             # Only add messages with actual content
             if bedrock_content:
@@ -214,16 +228,17 @@ def _format_messages_for_bedrock(self, messages: Messages) -> list[dict[str, Any
                     "content": bedrock_content
                 })
             else:
-                logger.warning(f"⚠️ Skipping empty message with role: {role}")
+                logger.warning(f"Skipping empty message with role: {role}")
         
         # Ensure we have at least one message
         if not bedrock_messages:
-            logger.warning("⚠️ No valid messages found, adding default message")
+            logger.warning("No valid messages found, adding default message")
             bedrock_messages.append({
                 "role": "user",
                 "content": [{"text": "Hello"}]
             })
         
+        logger.debug(f"Formatted {len(bedrock_messages)} messages with {len(tool_use_ids)} tool use IDs")
         return bedrock_messages
     
     def _format_request_body(
@@ -296,9 +311,9 @@ async def stream(
             request = self._format_request_body(messages, system_prompt, tool_specs, **kwargs)
             
             if tool_specs:
-                logger.info(f"🛠️ Custom Bedrock request includes {len(tool_specs)} tools")
+                logger.debug(f"Custom Bedrock request includes {len(tool_specs)} tools")
             
-            logger.info(f"🎯 Custom Bedrock streaming with {self.config['model_id']}")
+            logger.info(f"Custom Bedrock streaming with {self.config['model_id']}")
             
             # Use converse_stream API like official SDK
             response = await asyncio.get_event_loop().run_in_executor(
@@ -315,10 +330,10 @@ async def stream(
                     # This follows the same pattern as the official SDK
                     yield chunk
             
-            logger.info(f"✅ Custom Bedrock streaming completed for {self.config['model_id']}")
+            logger.info(f"Custom Bedrock streaming completed for {self.config['model_id']}")
             
         except Exception as e:
-            logger.error(f"🚨 Custom Bedrock streaming error: {type(e).__name__}: {e}")
+            logger.exception("Custom Bedrock streaming error")
             
             # Check if this is throttling and convert to Strands exception
             if self._is_throttling_error(e):
@@ -326,7 +341,7 @@ async def stream(
                 raise ModelThrottledException(f"Custom Bedrock throttling detected: {e}") from e
             elif self._is_connection_error(e):
                 # Connection errors - treat as throttling to trigger model switching
-                logger.warning(f"🔌 Connection error treated as throttling: {e}")
+                logger.warning(f"Connection error treated as throttling: {e}")
                 raise ModelThrottledException(f"Custom Bedrock connection error: {e}") from e
             else:
                 # Re-raise non-throttling errors as-is
@@ -415,7 +430,7 @@ def __init__(self, available_models: Optional[list[str]] = None, model_manager=N
         self.current_model_index = 0
         self.model_cooldowns = {}  # model_id -> cooldown_until_timestamp
         
-        logger.info(f"✅ Model Switching Bedrock Provider initialized with {len(self.available_models)} models from shared config")
+        logger.info(f"Model Switching Bedrock Provider initialized with {len(self.available_models)} models from shared config")
     
     def get_next_available_model(self) -> Optional[str]:
         """
@@ -433,7 +448,7 @@ def get_next_available_model(self) -> Optional[str]:
         ]
         for model_id in expired_models:
             del self.model_cooldowns[model_id]
-            logger.info(f"🔄 Model {model_id} cooldown expired")
+            logger.info(f"Model {model_id} cooldown expired")
         
         # Find next available model
         for i in range(len(self.available_models)):
@@ -442,10 +457,10 @@ def get_next_available_model(self) -> Optional[str]:
             
             if next_model not in self.model_cooldowns:
                 self.current_model_index = next_index
-                logger.info(f"🎯 Next available model: {next_model}")
+                logger.info(f"Next available model: {next_model}")
                 return next_model
         
-        logger.warning("❌ No models available - all in cooldown")
+        logger.warning("No models available - all in cooldown")
         return None
     
     def put_model_in_cooldown(self, model_id: str, cooldown_seconds: Optional[int] = None):
@@ -455,7 +470,7 @@ def put_model_in_cooldown(self, model_id: str, cooldown_seconds: Optional[int] =
         
         cooldown_until = time.time() + cooldown_seconds
         self.model_cooldowns[model_id] = cooldown_until
-        logger.warning(f"🚨 Model {model_id} in cooldown for {cooldown_seconds}s (using shared config)")
+        logger.warning(f"Model {model_id} in cooldown for {cooldown_seconds}s (using shared config)")
     
     def create_switching_model(self, initial_model_id: Optional[str] = None, **kwargs) -> 'SwitchingBedrockModel':
         """
@@ -507,15 +522,15 @@ def __init__(
         
         # Initialize with the first model
         self.current_model = CustomBedrockModel(model_id=initial_model_id, **kwargs)
-        logger.info(f"🔧 Switching model initialized with {initial_model_id}")
+        logger.info(f"Switching model initialized with {initial_model_id}")
     
     def _switch_to_model(self, model_id: str):
         """Switch to a specific model."""
         try:
             self.current_model = CustomBedrockModel(model_id=model_id, **self.kwargs)
-            logger.info(f"🔄 Switched to model: {model_id}")
+            logger.info(f"Switched to model: {model_id}")
         except Exception as e:
-            logger.warning(f"🚨 Error in stream: {e}")
+            logger.error(f"Error in stream: {e}")
             raise
     
     async def stream(
@@ -532,22 +547,22 @@ async def stream(
         
         while self.switches_attempted <= self.max_switches:
             try:
-                logger.info(f"🎯 Attempting stream with model: {self.current_model.get_config()['model_id']} (switch attempt {self.switches_attempted})")
+                logger.info(f"Attempting stream with model: {self.current_model.get_config()['model_id']} (switch attempt {self.switches_attempted})")
                 
                 async for event in self.current_model.stream(messages, tool_specs, system_prompt, **kwargs):
                     yield event
                 
                 # Success - stream completed
-                logger.info(f"✅ Stream completed successfully with model: {self.current_model.get_config()['model_id']}")
+                logger.info(f"Stream completed successfully with model: {self.current_model.get_config()['model_id']}")
                 return
                 
             except ModelThrottledException as e:
                 self.switches_attempted += 1
                 current_model_id = self.current_model.get_config()['model_id']
-                logger.warning(f"🚨 Model {current_model_id} throttled (attempt {self.switches_attempted})")
+                logger.warning(f"Model {current_model_id} throttled (attempt {self.switches_attempted})")
                 
                 if self.switches_attempted > self.max_switches:
-                    logger.error(f"❌ Exceeded max model switches ({self.max_switches})")
+                    logger.error(f"Exceeded max model switches ({self.max_switches})")
                     raise Exception(f"All model switching attempts failed. Last error: {e}")
                 
                 # Put current model in cooldown
@@ -558,15 +573,15 @@ async def stream(
                 if next_model:
                     # Switch to next model
                     self._switch_to_model(next_model)
-                    logger.warning(f"🔄 IMMEDIATE MODEL SWITCH: {current_model_id} → {next_model}")
+                    logger.warning(f"Immediate model switch: {current_model_id} -> {next_model}")
                     continue
                 
                 # No more models available
-                logger.error(f"❌ No alternative models available")
+                logger.error("No alternative models available")
                 raise Exception(f"No alternative models available. Last error: {e}")
             
             except Exception as e:
-                logger.warning(f"🚨 Non-throttling error with model {self.current_model.get_config()['model_id']}: {e}")
+                logger.exception(f"Non-throttling error with model {self.current_model.get_config()['model_id']}")
                 raise e
         
         raise Exception(f"Stream failed after {self.switches_attempted} model switches")
@@ -604,21 +619,21 @@ async def structured_output(
         
         while self.switches_attempted <= self.max_switches:
             try:
-                logger.info(f"🎯 Attempting structured output with model: {self.current_model.get_config()['model_id']} (switch attempt {self.switches_attempted})")
+                logger.info(f"Attempting structured output with model: {self.current_model.get_config()['model_id']} (switch attempt {self.switches_attempted})")
                 
                 result = await self.current_model.structured_output(messages, schema, tool_specs, system_prompt, **kwargs)
                 
                 # Success - structured output completed
-                logger.info(f"✅ Structured output completed successfully with model: {self.current_model.get_config()['model_id']}")
+                logger.info(f"Structured output completed successfully with model: {self.current_model.get_config()['model_id']}")
                 return result
                 
             except ModelThrottledException as e:
                 self.switches_attempted += 1
                 current_model_id = self.current_model.get_config()['model_id']
-                logger.warning(f"🚨 Model {current_model_id} throttled during structured output (attempt {self.switches_attempted})")
+                logger.warning(f"Model {current_model_id} throttled during structured output (attempt {self.switches_attempted})")
                 
                 if self.switches_attempted > self.max_switches:
-                    logger.error(f"❌ Exceeded max model switches ({self.max_switches}) for structured output")
+                    logger.error(f"Exceeded max model switches ({self.max_switches}) for structured output")
                     raise Exception(f"All model switching attempts failed for structured output. Last error: {e}")
                 
                 # Put current model in cooldown
@@ -629,15 +644,15 @@ async def structured_output(
                 if next_model:
                     # Switch to next model
                     self._switch_to_model(next_model)
-                    logger.warning(f"🔄 IMMEDIATE MODEL SWITCH for structured output: {current_model_id} → {next_model}")
+                    logger.warning(f"Immediate model switch for structured output: {current_model_id} -> {next_model}")
                     continue
                 
                 # No more models available
-                logger.error(f"❌ No alternative models available for structured output")
+                logger.error("No alternative models available for structured output")
                 raise Exception(f"No alternative models available for structured output. Last error: {e}")
             
             except Exception as e:
-                logger.warning(f"🚨 Non-throttling error during structured output with model {self.current_model.get_config()['model_id']}: {e}")
+                logger.exception(f"Non-throttling error during structured output with model {self.current_model.get_config()['model_id']}")
                 raise e
         
         raise Exception(f"Structured output failed after {self.switches_attempted} model switches")
diff --git a/application_src/common/memory/elasticsearch.py b/application_src/common/memory/elasticsearch.py
index b2e88a6..7ce8cec 100644
--- a/application_src/common/memory/elasticsearch.py
+++ b/application_src/common/memory/elasticsearch.py
@@ -6,11 +6,11 @@
 """
 
 import logging
-from typing import Any
+from typing import Any, Dict, List
 
-from strands_tools.elasticsearch_memory import ElasticsearchMemoryTool
+from strands_tools.elasticsearch_memory import elasticsearch_memory
 
-from .base import BaseMemory
+from .base import BaseMemoryProvider as BaseMemory
 
 logger = logging.getLogger(__name__)
 
@@ -23,160 +23,95 @@ class ElasticsearchMemory(BaseMemory):
     Elasticsearch as the backend storage system.
     """
     
-    def __init__(self, config: list[dict[str, Any]]):
+    def __init__(self, config: Dict[str, Any]):
         """
         Initialize Elasticsearch memory provider.
         
         Args:
-            config: List of configuration dictionaries containing:
-                - cloud_id: Elasticsearch cloud deployment ID
-                - api_key: API key for authentication
-                - index_name: Index name for storing memories (optional)
-                - dimensions: Vector dimensions for embeddings (optional)
-        
-        Raises:
-            ValueError: If required configuration is missing
+            config: Configuration dictionary for the memory provider
         """
         super().__init__(config)
+        self.provider_name = "elasticsearch"
         
-        # Extract configuration parameters
-        config_dict = {item['name']: item['config'] for item in config}
-        
-        # Validate required parameters
-        required_params = ['cloud_id', 'api_key']
-        for param in required_params:
-            if param not in config_dict:
-                raise ValueError(f"Missing required parameter: {param}")
-        
-        # Extract configuration with defaults
-        cloud_id = config_dict['cloud_id']
-        api_key = config_dict['api_key']
-        index_name = config_dict.get('index_name', 'agent_memory')
-        dimensions = int(config_dict.get('dimensions', 1024))
-        
-        # Initialize the Elasticsearch memory tool
-        try:
-            self.memory_tool = ElasticsearchMemoryTool(
-                cloud_id=cloud_id,
-                api_key=api_key,
-                index_name=index_name,
-                dimensions=dimensions
-            )
-            logger.info(
-                f"Initialized Elasticsearch memory with index: {index_name}, "
-                f"dimensions: {dimensions}"
-            )
-        except Exception as e:
-            logger.error(f"Failed to initialize Elasticsearch memory: {e}")
-            raise
+        logger.info("Initialized Elasticsearch memory provider")
     
-    def save(self, session_id: str, human_message: str, ai_message: str) -> None:
-        """
-        Save conversation to Elasticsearch memory.
-        
-        Args:
-            session_id: Unique identifier for the conversation session
-            human_message: The user's message
-            ai_message: The AI's response
-        """
+    def initialize(self) -> list:
+        """Initialize the Elasticsearch memory provider and get the tools."""
         try:
-            # Construct memory entry with context
-            memory_text = f"User: {human_message}\nAssistant: {ai_message}"
+            # Get provider configuration
+            provider_config = self.get_provider_config()
             
-            # Metadata for filtering and organization
-            metadata = {
-                "session_id": session_id,
-                "type": "conversation",
-                "timestamp": self._get_timestamp()
-            }
+            if not provider_config:
+                logger.warning("No Elasticsearch configuration found")
+                return []
             
-            # Store in Elasticsearch
-            result = self.memory_tool.add_memory(
-                memory=memory_text,
-                metadata=metadata
-            )
+            # Create wrapped elasticsearch_memory tool
+            from strands import tool
             
-            logger.debug(
-                f"Saved conversation to Elasticsearch for session {session_id}: "
-                f"{result.get('message', 'Success')}"
-            )
+            @tool
+            def elasticsearch_memory_tool(action: str, content: str = None, query: str = None, 
+                                         session_id: str = None) -> Dict[str, Any]:
+                """
+                Elasticsearch memory tool for storing and retrieving information.
+                
+                Args:
+                    action: The action to perform ('store', 'retrieve', or 'clear')
+                    content: The content to store (for 'store' action)
+                    query: The query to search for (for 'retrieve' action)
+                    session_id: Session ID for organizing memories
+                    
+                Returns:
+                    Dictionary with the results of the operation
+                """
+                try:
+                    # Create function call parameters
+                    function_params = {
+                        "action": action,
+                        **provider_config
+                    }
+                    
+                    if action == "store" and content:
+                        function_params.update({
+                            "memory": content,
+                            "session_id": session_id or "default",
+                            "timestamp": ElasticsearchMemory._get_timestamp()
+                        })
+                    elif action == "retrieve" and query:
+                        function_params.update({
+                            "query": query,
+                            "n_results": 5,
+                            "session_id": session_id or "default"
+                        })
+                    elif action == "clear":
+                        function_params.update({
+                            "session_id": session_id or "default"
+                        })
+                    
+                    # Call elasticsearch_memory function
+                    result = elasticsearch_memory(function_params)
+                    
+                    logger.debug(f"Elasticsearch memory operation {action} completed")
+                    return result
+                    
+                except Exception as e:
+                    error_msg = f"Error in elasticsearch_memory tool: {str(e)}"
+                    logger.error(error_msg)
+                    return {"status": "error", "message": error_msg}
             
-        except Exception as e:
-            logger.error(f"Failed to save memory for session {session_id}: {e}")
-            raise
-    
-    def get_context(self, session_id: str, query: str | None = None) -> str:
-        """
-        Retrieve relevant context from Elasticsearch memory.
-        
-        Args:
-            session_id: Unique identifier for the conversation session
-            query: Optional search query for semantic retrieval
-        
-        Returns:
-            Formatted string containing relevant conversation history
-        """
-        try:
-            # Use query if provided, otherwise use session_id for filtering
-            search_query = query if query else f"session:{session_id}"
-            
-            # Retrieve memories from Elasticsearch
-            result = self.memory_tool.get_memories(
-                query=search_query,
-                n_results=5  # Configurable number of results
-            )
-            
-            memories = result.get('memories', [])
-            
-            if not memories:
-                logger.debug(f"No memories found for session {session_id}")
-                return ""
+            self.tools = [elasticsearch_memory_tool]
+            logger.info(f"Successfully created {len(self.tools)} Elasticsearch memory tools")
             
-            # Format memories for context
-            context_parts = []
-            for i, memory in enumerate(memories, 1):
-                memory_text = memory.get('memory', '')
-                context_parts.append(f"[Memory {i}]\n{memory_text}")
-            
-            context = "\n\n".join(context_parts)
-            logger.debug(
-                f"Retrieved {len(memories)} memories for session {session_id}"
-            )
-            
-            return context
+            return self.tools
             
         except Exception as e:
-            logger.error(
-                f"Failed to retrieve context for session {session_id}: {e}"
-            )
-            # Return empty context on error to allow conversation to continue
-            return ""
+            logger.error(f"Error initializing Elasticsearch memory provider: {str(e)}")
+            return []
     
-    def clear(self, session_id: str) -> None:
-        """
-        Clear memory for a specific session.
-        
-        Note: The strands-tools Elasticsearch memory tool clears ALL memories.
-        For session-specific clearing, we would need to implement filtering,
-        which is not currently supported by the underlying tool.
-        
-        Args:
-            session_id: Unique identifier for the conversation session
-        """
-        try:
-            logger.warning(
-                f"Clearing ALL Elasticsearch memories (session-specific "
-                f"clearing not supported by strands-tools). "
-                f"Session ID: {session_id}"
-            )
-            
-            result = self.memory_tool.clear_memories()
-            
-            logger.info(f"Cleared Elasticsearch memories: {result.get('message')}")
-            
-        except Exception as e:
-            logger.error(f"Failed to clear memories for session {session_id}: {e}")
-            raise
+    def get_tools(self) -> List:
+        """Get memory tools - following the same pattern as other providers."""
+        if not hasattr(self, 'tools') or not self.tools:
+            return self.initialize()
+        return self.tools
     
     @staticmethod
     def _get_timestamp() -> str:
diff --git a/application_src/common/observability/__init__.py b/application_src/common/observability/__init__.py
index 1ec42fd..62024d0 100644
--- a/application_src/common/observability/__init__.py
+++ b/application_src/common/observability/__init__.py
@@ -3,10 +3,16 @@
 This module provides a factory for creating observability providers.
 """
 
+import logging
+
 from config import Config
 from .base import BaseObservabilityProvider
 from .langfuse import LangfuseObservabilityProvider
 from .dynatrace import DynatraceObservabilityProvider
+from .elastic import ElasticObservabilityProvider
+
+logger = logging.getLogger(__name__)
+
 
 class ObservabilityFactory:
     """Factory for creating observability providers."""
@@ -14,45 +20,49 @@ class ObservabilityFactory:
     @staticmethod
     def create(agent_name="qa_agent"):
         """Create an observability provider based on configuration."""
-        print(f"🏭 ObservabilityFactory.create() called for agent: {agent_name}")
+        logger.debug(f"ObservabilityFactory.create() called for agent: {agent_name}")
         
         # Create a config instance with the specified agent_name
         agent_config = Config(agent_name)
         obs_config = agent_config.get_observability_config()
         
-        print(f"📋 Observability config for {agent_name}: {obs_config}")
+        logger.debug(f"Observability config for {agent_name}: enabled={obs_config.get('enabled')}, provider={obs_config.get('provider')}")
         
         if not obs_config["enabled"]:
-            print("❌ Observability is disabled")
+            logger.info("Observability is disabled")
             return None
         
         provider = obs_config.get("provider")
         
         if not provider:
-            print("❌ No observability provider specified, disabling observability")
+            logger.warning("No observability provider specified, disabling observability")
             return None
         
         provider = provider.lower()
-        print(f"🔧 Creating observability provider: {provider}")
+        logger.info(f"Creating observability provider: {provider}")
         
         if provider == "langfuse":
-            print("✅ Creating Langfuse observability provider")
+            logger.debug("Creating Langfuse observability provider")
             return LangfuseObservabilityProvider(obs_config)
         elif provider == "dynatrace":
-            print("✅ Creating Dynatrace observability provider")
+            logger.debug("Creating Dynatrace observability provider")
             return DynatraceObservabilityProvider(obs_config)
+        elif provider == "elastic":
+            logger.debug("Creating Elastic observability provider")
+            return ElasticObservabilityProvider(obs_config)
         else:
-            print(f"❌ Unknown observability provider: {provider}")
+            logger.error(f"Unknown observability provider: {provider}")
             return None
 
+
 def get_trace_attributes(agent_name="qa_agent"):
     """Get trace attributes for use with Strands Agent."""
-    print(f"🔍 Getting trace attributes for agent: {agent_name}...")
+    logger.debug(f"Getting trace attributes for agent: {agent_name}")
     obs_provider = ObservabilityFactory.create(agent_name)
     if obs_provider:
         trace_attrs = obs_provider.get_trace_attributes()
-        print(f"✅ Trace attributes retrieved: {trace_attrs}")
+        logger.debug(f"Trace attributes retrieved: {trace_attrs}")
         return trace_attrs
     else:
-        print("❌ No observability provider available")
+        logger.warning("No observability provider available")
         return {}
diff --git a/application_src/common/observability/elastic.py b/application_src/common/observability/elastic.py
new file mode 100644
index 0000000..b173dba
--- /dev/null
+++ b/application_src/common/observability/elastic.py
@@ -0,0 +1,236 @@
+"""
+Elastic observability provider for GenAI-In-A-Box agent.
+This module provides an observability provider for Elastic with proper OpenTelemetry initialization.
+"""
+
+import logging
+import os
+import uuid
+from typing import Dict, Any
+from .base import BaseObservabilityProvider
+
+logger = logging.getLogger(__name__)
+
+
+class ElasticObservabilityProvider(BaseObservabilityProvider):
+    """Observability provider for Elastic."""
+    
+    def __init__(self, config: Dict[str, Any]):
+        """Initialize the Elastic observability provider."""
+        super().__init__(config)
+        self.provider_name = "elastic"
+    
+    def initialize(self) -> Dict[str, Any]:
+        """Initialize the Elastic observability provider and get the trace attributes."""
+        try:
+            provider_config = self.get_provider_config()
+            
+            # Get Elastic configuration
+            api_key = provider_config.get("api_key", "")
+            otlp_endpoint = provider_config.get("otlp_endpoint", "")
+            
+            logger.debug("Elastic credentials check:")
+            logger.debug(f"   API Key: {'Present' if api_key else 'Missing'}")
+            logger.debug(f"   OTLP Endpoint: {'Configured' if otlp_endpoint else 'Missing'}")
+            
+            if not api_key:
+                logger.error("Elastic API key (api_key) is required")
+                return {}
+                
+            if not otlp_endpoint:
+                logger.error("Elastic OTLP endpoint (otlp_endpoint) is required")
+                return {}
+            
+            # Set up environment variables for Elastic (CRITICAL for Strands integration)
+            os.environ["ELASTIC_API_KEY"] = api_key
+            os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = otlp_endpoint
+            os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"Authorization=ApiKey {api_key}"
+            
+            logger.info("Elastic environment variables configured successfully")
+            
+            # CRITICAL: Initialize OpenTelemetry for Elastic
+            try:
+                self._initialize_opentelemetry(otlp_endpoint, api_key)
+                logger.info("OpenTelemetry initialized successfully for Elastic")
+            except Exception as otel_error:
+                logger.warning(f"OpenTelemetry initialization failed: {otel_error}")
+                logger.warning("Traces will not be sent to Elastic")
+                # Don't return empty dict - still provide trace attributes for debugging
+            
+            # Get service name from config or environment
+            # Priority: agent_name from config > AGENT_NAME env var > SERVICE_NAME env var > default
+            service_name = (
+                self.config.get("agent_name") or 
+                os.environ.get('AGENT_NAME') or 
+                os.environ.get('SERVICE_NAME') or 
+                'genai-in-a-box'
+            )
+            
+            # Get service version from config or environment
+            service_version = (
+                self.config.get("agent_version") or 
+                os.environ.get('SERVICE_VERSION') or 
+                '1.0.0'
+            )
+            
+            # Get optional dataset routing configuration
+            dataset = provider_config.get("dataset", "generic.otel")
+            namespace = provider_config.get("namespace", "default")
+            
+            self.trace_attributes = {
+                "session.id": f"{service_name}-session-{uuid.uuid4()}",
+                "user.id": f"{service_name}-user",
+                "service.name": service_name,
+                "service.version": service_version,
+                "deployment.environment": os.environ.get('ENVIRONMENT', 'production'),
+                "data_stream.dataset": dataset,
+                "data_stream.namespace": namespace
+            }
+            
+            logger.info("Elastic observability provider initialized successfully")
+            logger.debug(f"Trace attributes: {self.trace_attributes}")
+            return self.trace_attributes
+            
+        except Exception as e:
+            logger.exception("Error initializing Elastic observability provider")
+            return {}
+    
+    def _initialize_opentelemetry(self, otlp_endpoint: str, api_key: str):
+        """Initialize OpenTelemetry with OTLP exporter for Elastic."""
+        try:
+            # Import OpenTelemetry components
+            from opentelemetry import trace
+            from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+            from opentelemetry.sdk.trace import TracerProvider
+            from opentelemetry.sdk.trace.export import BatchSpanProcessor
+            from opentelemetry.sdk.resources import Resource
+            
+            logger.debug("OpenTelemetry packages imported successfully")
+            
+            # Get dataset and namespace from config
+            provider_config = self.get_provider_config()
+            dataset = provider_config.get("dataset", "generic.otel")
+            namespace = provider_config.get("namespace", "default")
+            
+            # Get service name and version (same logic as trace_attributes)
+            service_name = (
+                self.config.get("agent_name") or 
+                os.environ.get('AGENT_NAME') or 
+                os.environ.get('SERVICE_NAME') or 
+                'genai-in-a-box'
+            )
+            service_version = (
+                self.config.get("agent_version") or 
+                os.environ.get('SERVICE_VERSION') or 
+                '1.0.0'
+            )
+            
+            # Create resource with service information
+            # Include data stream routing attributes for Elastic
+            resource = Resource.create({
+                "service.name": service_name,
+                "service.version": service_version,
+                "deployment.environment": os.environ.get('ENVIRONMENT', 'production'),
+                "data_stream.dataset": dataset,
+                "data_stream.namespace": namespace
+            })
+            
+            # Set up tracer provider
+            tracer_provider = TracerProvider(resource=resource)
+            trace.set_tracer_provider(tracer_provider)
+            
+            logger.debug("TracerProvider configured")
+            
+            # Create OTLP exporter with Elastic API Key authentication
+            headers = {"Authorization": f"ApiKey {api_key}"}
+            
+            logger.debug(f"Data Stream: traces-{dataset}-{namespace}")
+            logger.debug("OTLP Endpoint Configuration:")
+            logger.debug(f"   Base endpoint from config: {otlp_endpoint}")
+            
+            # Ensure the endpoint has the correct OTLP traces path
+            # Elastic OTLP endpoint should end with /v1/traces
+            if not otlp_endpoint.endswith('/v1/traces'):
+                if otlp_endpoint.endswith('/'):
+                    otlp_endpoint = otlp_endpoint + 'v1/traces'
+                else:
+                    otlp_endpoint = otlp_endpoint + '/v1/traces'
+            
+            logger.debug(f"   Final OTLP traces endpoint: {otlp_endpoint}")
+            logger.debug("   Headers: Authorization=ApiKey [REDACTED]")
+            
+            otlp_exporter = OTLPSpanExporter(
+                endpoint=otlp_endpoint,
+                headers=headers
+            )
+            
+            logger.debug("OTLP Exporter created successfully")
+            logger.debug(f"   Exporter endpoint: {otlp_exporter._endpoint}")
+            logger.debug(f"   Exporter will send traces to: {otlp_endpoint}")
+            
+            # Wrap the exporter to add detailed error logging and resilience
+            class ResilientOTLPSpanExporter:
+                def __init__(self, wrapped_exporter):
+                    self._wrapped = wrapped_exporter
+                    self._failed_exports = 0
+                    self._max_failures = 5  # Stop trying after 5 consecutive failures
+                    
+                def export(self, spans):
+                    """Export spans via the wrapped exporter; returns FAILURE instead of raising, and skips after repeated failures."""
+                    from opentelemetry.sdk.trace.export import SpanExportResult
+                    # Skip export if we've had too many failures
+                    if self._failed_exports >= self._max_failures:
+                        logger.warning(f"OTLP Export Skipped: Too many consecutive failures ({self._failed_exports})")
+                        return SpanExportResult.FAILURE
+                    
+                    try:
+                        logger.debug("OTLP Export Debug:")
+                        logger.debug(f"   Sending {len(spans)} spans to: {self._wrapped._endpoint}")
+                        # Never log the raw headers: they carry the Elastic API key
+                        logger.debug("   Request headers: Authorization=ApiKey [REDACTED]")
+                        result = self._wrapped.export(spans)
+                        logger.debug(f"   Export result: {result}")
+                        
+                        # Reset failure counter on success
+                        if result.name == 'SUCCESS':
+                            self._failed_exports = 0
+                        else:
+                            self._failed_exports += 1
+                            logger.warning(f"Export failed, failure count: {self._failed_exports}")
+                            
+                        return result
+                    except Exception as e:
+                        self._failed_exports += 1
+                        logger.error(f"OTLP Export Error (failure {self._failed_exports}/{self._max_failures}):")
+                        logger.error(f"   Error type: {type(e).__name__}")
+                        logger.error(f"   Error message: {str(e)}")
+                        logger.error(f"   Endpoint attempted: {self._wrapped._endpoint}")
+                        
+                        # Only log full traceback for first few failures to reduce log spam
+                        if self._failed_exports <= 3:
+                            logger.exception("OTLP Export Exception details:")
+                        # Return failure instead of raising to prevent crash
+                        return SpanExportResult.FAILURE
+                        
+                def shutdown(self):
+                    return self._wrapped.shutdown()
+                    
+                def force_flush(self, timeout_millis: int = 30000):
+                    return self._wrapped.force_flush(timeout_millis)
+            
+            # Wrap the exporter for resilience and debugging
+            resilient_exporter = ResilientOTLPSpanExporter(otlp_exporter)
+            
+            # Add span processor with resilient exporter
+            span_processor = BatchSpanProcessor(resilient_exporter)
+            tracer_provider.add_span_processor(span_processor)
+            
+            logger.info("OpenTelemetry configured with OTLP exporter for Elastic")
+            
+        except ImportError as import_error:
+            logger.error(f"Missing OpenTelemetry dependencies: {import_error}")
+            logger.error("Install with: pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp")
+            raise
+        except Exception as setup_error:
+            logger.exception("OpenTelemetry setup failed")
+            raise
diff --git a/application_src/configuration-api/app/api/config.py b/application_src/configuration-api/app/api/config.py
index 50c7ffa..bddf9ae 100644
--- a/application_src/configuration-api/app/api/config.py
+++ b/application_src/configuration-api/app/api/config.py
@@ -165,7 +165,7 @@ async def get_agent_config(
         result = agent_service.load_agent_configuration(agent_name)
         
         # Convert result to dict for manipulation
-        config_dict = result.dict() if hasattr(result, 'dict') else dict(result)
+        config_dict = result.model_dump(mode='json') if hasattr(result, 'model_dump') else dict(result)
         
         return config_dict
 
@@ -735,6 +735,71 @@ async def create_system_prompt(
         raise HTTPException(status_code=500, detail="Internal server error occurred")
 
 
+@config_router.post('/update-deployment/{agent_name}')
+async def update_agent_deployment(
+    agent_name: str,
+    current_user: UserInfo = Depends(get_current_user),
+    _: None = Depends(RequirePermission("config:update"))
+) -> Dict[str, Any]:
+    """
+    Trigger CloudFormation stack update to force new ECS task deployment.
+    
+    This endpoint updates the CloudFormation stack for the agent, which will:
+    1. Force ECS to deploy new tasks with latest configuration
+    2. Maintain zero-downtime deployment
+    3. Pull latest container images if updated
+    
+    Args:
+        agent_name: Name of the agent to update deployment for
+        
+    Returns:
+        Dictionary with update status and stack information
+        
+    Raises:
+        HTTPException: 404 if update_agent_stack raises ValueError, 500 for any other error
+    """
+    try:
+        logger.info(f"Triggering deployment update for agent: {agent_name}")
+        
+        # Get deployment service; only the factory is needed here, and it is
+        # imported inside the handler rather than at module import time
+        from ..utils.dependencies import get_deployment_service
+        
+        deployment_service = get_deployment_service()
+        
+        # Update the agent stack to force new deployment
+        update_result = await deployment_service.update_agent_stack(
+            agent_name=agent_name,
+            parameters=None  # Use existing parameters, just trigger update
+        )
+        
+        logger.info(f"Successfully triggered deployment update for agent: {agent_name}")
+        
+        return {
+            "status": "success",
+            "message": f"Deployment update initiated for agent '{agent_name}'",
+            "agent_name": agent_name,
+            "stack_name": update_result.get("stack_name"),
+            "stack_status": update_result.get("status"),
+            "details": update_result
+        }
+        
+    except ValueError as e:
+        # Agent stack not found
+        log_exception_safely(logger, e, f"Agent stack not found for '{agent_name}'")
+        raise HTTPException(
+            status_code=404,
+            detail=f"No CloudFormation stack found for agent '{agent_name}'"
+        ) from e
+    except Exception as e:
+        logger.error(f"Error updating deployment for agent '{agent_name}'")
+        log_exception_safely(logger, e, f"Error updating deployment for '{agent_name}'")
+        raise HTTPException(
+            status_code=500,
+            detail="Internal server error during deployment update"
+        ) from e
+
+
 @config_router.post('/refresh-agent/{agent_name}')
 async def refresh_agent_instances(
     agent_name: str,
@@ -819,7 +884,7 @@ async def refresh_agent_instances(
                     # Check if this agent matches our target
                     if current_agent_name == agent_name:
                         matching_agents_found += 1
-                        logger.info(f"✅ Found matching agent '{agent_name}' at {agent_url}, triggering refresh")
+                        logger.info(f"Found matching agent '{agent_name}' at {agent_url}, triggering refresh")
                         
                         # Call the agent's /config/load endpoint with its own name  
                         load_url = f"{agent_url.rstrip('/')}/config/load"
@@ -839,7 +904,7 @@ async def refresh_agent_instances(
                                 "timestamp": response_data.get("timestamp", "unknown")
                             }
                             successful_refreshes.append(agent_url)
-                            logger.info(f"✅ Successfully refreshed agent '{agent_name}' at {agent_url}")
+                            logger.info(f"Successfully refreshed agent '{agent_name}' at {agent_url}")
                             
                         else:
                             refresh_results[agent_url] = {
@@ -851,7 +916,7 @@ async def refresh_agent_instances(
                                 "timestamp": None
                             }
                             failed_refreshes.append({"url": agent_url, "reason": "refresh failed"})
-                            logger.error(f"❌ Agent '{agent_name}' at {agent_url} refresh failed")
+                            logger.error(f"Agent '{agent_name}' at {agent_url} refresh failed")
                     else:
                         # Agent doesn't match, skip it
                         refresh_results[agent_url] = {
@@ -861,7 +926,7 @@ async def refresh_agent_instances(
                             "checked": True,
                             "matches_target": False
                         }
-                        logger.info(f"⏭️ Agent '{current_agent_name}' at {agent_url} does not match target '{agent_name}', skipped")
+                        logger.info(f"Agent '{current_agent_name}' at {agent_url} does not match target '{agent_name}', skipped")
                         
                 except httpx.TimeoutException as e:
                     log_exception_safely(logger, e, f"Timeout checking agent at {agent_url}")
@@ -873,7 +938,7 @@ async def refresh_agent_instances(
                         "error": "Request timeout after 30 seconds"
                     }
                     failed_refreshes.append({"url": agent_url, "reason": "timeout"})
-                    logger.error(f"⏰ Agent at {agent_url} timed out")
+                    logger.error(f"Agent at {agent_url} timed out")
                     
                 except httpx.RequestError as e:
                     log_exception_safely(logger, e, f"Request error checking agent at {agent_url}")
diff --git a/application_src/configuration-api/app/api/deployment.py b/application_src/configuration-api/app/api/deployment.py
index 8d95dd0..30b9993 100644
--- a/application_src/configuration-api/app/api/deployment.py
+++ b/application_src/configuration-api/app/api/deployment.py
@@ -13,9 +13,9 @@
 from fastapi import APIRouter, HTTPException, BackgroundTasks, Depends, Body
 from pydantic import BaseModel, Field
 
-from app.services.cloudformation_deployment_service import CloudFormationDeploymentService
+from app.services.deployment_service import DeploymentService
 from app.services.agent_config_service import AgentConfigService
-from app.utils.dependencies import get_cloudformation_deployment_service, get_agent_config_service
+from app.utils.dependencies import get_deployment_service, get_agent_config_service
 from app.models import AgentConfigRequest
 
 # Authentication middleware imports
@@ -65,7 +65,7 @@ async def deploy_agent_stack(
     background_tasks: BackgroundTasks,
     current_user: UserInfo = Depends(get_current_user),
     _: None = Depends(RequirePermission("agent:deploy")),
-    cfn_service: CloudFormationDeploymentService = Depends(get_cloudformation_deployment_service)
+    deployment_service: DeploymentService = Depends(get_deployment_service)
 ):
     """
     Deploy a new agent stack using CloudFormation API.
@@ -100,11 +100,14 @@ async def deploy_agent_stack(
                 detail="Agent name must contain only letters, numbers, underscores, and hyphens"
             )
         
-        # Deploy the stack using CloudFormation
-        result = cfn_service.deploy_agent_stack(
-            agent_name=request.agent_name,
-            parameters=request.parameters,
-            timeout_minutes=request.timeout_minutes
+        # Deploy the stack using CloudFormation template from S3
+        # Use consistent naming pattern throughout all operations
+        new_stack_name = deployment_service.get_stack_name_from_agent(request.agent_name)
+        
+        result = await deployment_service.create_agent_stack(
+            new_agent_name=request.agent_name,
+            new_stack_name=new_stack_name,
+            model_config=None  # Will read from SSM
         )
         
         logger.info(f"Successfully deployed stack: {result['stack_name']}")
@@ -136,14 +139,14 @@ async def get_stack_status(
     agent_name: str,
     current_user: UserInfo = Depends(get_current_user),
     _: None = Depends(RequirePermission("agent:read")),
-    cfn_service: CloudFormationDeploymentService = Depends(get_cloudformation_deployment_service)
+    deployment_service: DeploymentService = Depends(get_deployment_service)
 ):
     """
     Get the current status of an agent's CloudFormation stack.
     
     Args:
         agent_name: Name of the agent
-        cfn_service: Injected CloudFormation deployment service
+        deployment_service: Injected deployment service
         
     Returns:
         Current stack status and information
@@ -154,19 +157,21 @@ async def get_stack_status(
     try:
         logger.info(f"Retrieving status for agent: {agent_name}")
         
-        status_info = cfn_service.get_stack_info(agent_name)
+        # Find the stack for this agent first
+        stack_info = await deployment_service.find_agent_stack_by_name(agent_name)
+        
+        if not stack_info:
+            raise HTTPException(status_code=404, detail="Agent stack not found")
+        
+        status_info = await deployment_service.get_stack_status(stack_info['stack_name'])
         
         return StackStatusResponse(**status_info)
         
-    except RuntimeError as e:
+    except HTTPException:
+        raise
+    except Exception as e:
         logger.error("Error retrieving stack status")
         log_exception_safely(logger, "Error retrieving stack status", e)
-        if 'not found' in str(e).lower():
-            raise HTTPException(status_code=404, detail="Agent stack not found")
-        raise HTTPException(status_code=500, detail="Failed to retrieve stack status")
-    except Exception as e:
-        logger.error("Unexpected error retrieving stack status")
-        log_exception_safely(logger, "Unexpected error retrieving stack status", e)
         raise HTTPException(status_code=500, detail="Failed to retrieve stack status")
 
 
@@ -174,13 +179,13 @@ async def get_stack_status(
 async def list_agent_stacks(
     current_user: UserInfo = Depends(get_current_user),
     _: None = Depends(RequirePermission("agent:read")),
-    cfn_service: CloudFormationDeploymentService = Depends(get_cloudformation_deployment_service)
+    deployment_service: DeploymentService = Depends(get_deployment_service)
 ):
     """
     List all agent stacks with their current status.
     
     Args:
-        cfn_service: Injected CloudFormation deployment service
+        deployment_service: Injected deployment service
         
     Returns:
         List of agent stacks with status information
@@ -191,21 +196,17 @@ async def list_agent_stacks(
     try:
         logger.info("Retrieving list of agent stacks")
         
-        stacks = cfn_service.list_agent_stacks()
+        stacks = await deployment_service.list_agent_stacks()
         
         return {
             "stacks": stacks,
             "count": len(stacks)
         }
         
-    except RuntimeError as e:
+    except Exception as e:
         logger.error("Error listing agent stacks")
         log_exception_safely(logger, "Error listing agent stacks", e)
         raise HTTPException(status_code=500, detail="Failed to list agent stacks")
-    except Exception as e:
-        logger.error("Unexpected error listing agent stacks")
-        log_exception_safely(logger, "Unexpected error listing agent stacks", e)
-        raise HTTPException(status_code=500, detail="Failed to list stacks")
 
 
 @deployment_router.delete("/stack/{agent_name}")
@@ -214,7 +215,7 @@ async def delete_agent_stack(
     timeout_minutes: int = 30,
     current_user: UserInfo = Depends(get_current_user),
     _: None = Depends(RequirePermission("agent:delete")),
-    cfn_service: CloudFormationDeploymentService = Depends(get_cloudformation_deployment_service)
+    deployment_service: DeploymentService = Depends(get_deployment_service)
 ):
     """
     Delete an agent stack using CloudFormation API.
@@ -222,7 +223,7 @@ async def delete_agent_stack(
     Args:
         agent_name: Name of the agent
         timeout_minutes: Maximum time to wait for deletion
-        cfn_service: Injected CloudFormation deployment service
+        deployment_service: Injected deployment service
         
     Returns:
         Deletion confirmation
@@ -233,29 +234,27 @@ async def delete_agent_stack(
     try:
         logger.info(f"Deleting stack for agent: {agent_name}")
         
-        result = cfn_service.delete_agent_stack(
-            agent_name=agent_name,
-            timeout_minutes=timeout_minutes
-        )
+        # Find the stack for this agent first
+        stack_info = await deployment_service.find_agent_stack_by_name(agent_name)
+        
+        if not stack_info:
+            raise HTTPException(status_code=404, detail="Agent stack not found")
+        
+        result = await deployment_service.delete_stack(stack_info['stack_name'])
         
         return {
             "message": f"Stack deletion completed for agent: {agent_name}",
             "stack_name": result['stack_name'],
-            "agent_name": result['agent_name'],
-            "status": result['status'],
-            "deleted_at": result['deleted_at']
+            "agent_name": agent_name,
+            "status": result['status']
         }
         
-    except RuntimeError as e:
+    except HTTPException:
+        raise
+    except Exception as e:
         logger.error("Error deleting stack")
         log_exception_safely(logger, "Error deleting stack", e)
-        if 'not found' in str(e).lower():
-            raise HTTPException(status_code=404, detail="Agent stack not found")
         raise HTTPException(status_code=500, detail="Failed to delete agent stack")
-    except Exception as e:
-        logger.error("Unexpected error deleting stack")
-        log_exception_safely(logger, "Unexpected error deleting stack", e)
-        raise HTTPException(status_code=500, detail="Failed to delete stack")
 
 
 @deployment_router.put("/stack/{agent_name}")
@@ -265,7 +264,7 @@ async def update_agent_stack(
     timeout_minutes: int = 30,
     current_user: UserInfo = Depends(get_current_user),
     _: None = Depends(RequirePermission("agent:update")),
-    cfn_service: CloudFormationDeploymentService = Depends(get_cloudformation_deployment_service)
+    deployment_service: DeploymentService = Depends(get_deployment_service)
 ):
     """
     Update an existing agent stack using CloudFormation API.
@@ -274,7 +273,7 @@ async def update_agent_stack(
         agent_name: Name of the agent
         parameters: CloudFormation parameters for update
         timeout_minutes: Maximum time to wait for update
-        cfn_service: Injected CloudFormation deployment service
+        deployment_service: Injected deployment service
         
     Returns:
         Update confirmation
@@ -285,10 +284,9 @@ async def update_agent_stack(
     try:
         logger.info(f"Updating stack for agent: {agent_name}")
         
-        result = cfn_service.update_agent_stack(
+        result = await deployment_service.update_agent_stack(
             agent_name=agent_name,
-            parameters=parameters,
-            timeout_minutes=timeout_minutes
+            parameters=parameters
         )
         
         return {
@@ -298,22 +296,13 @@ async def update_agent_stack(
             "outputs": result.get('outputs', {})
         }
         
-    except RuntimeError as e:
+    except ValueError as e:  # NOTE(review): any ValueError raised in the try body is reported as 404 — confirm it only signals a missing stack
+        log_exception_safely(logger, e, "Agent stack not found")  # NOTE(review): sibling calls in this file pass (logger, message, exc) — confirm argument order
+        raise HTTPException(status_code=404, detail="Agent stack not found")
+    except Exception as e:
         logger.error("Error updating stack")
         log_exception_safely(logger, "Error updating stack", e)
-        if 'not found' in str(e).lower():
-            raise HTTPException(status_code=404, detail="Agent stack not found")
-        if 'No updates' in str(e):
-            return {
-                "message": f"No updates needed for agent: {agent_name}",
-                "stack_name": cfn_service._get_stack_name(agent_name),
-                "status": "UP_TO_DATE"
-            }
         raise HTTPException(status_code=500, detail="Failed to update agent stack")
-    except Exception as e:
-        logger.error("Unexpected error updating stack")
-        log_exception_safely(logger, "Unexpected error updating stack", e)
-        raise HTTPException(status_code=500, detail="Failed to update stack")
 
 
 @deployment_router.post("/create-agent")
@@ -322,12 +311,12 @@ async def create_agent(
     background_tasks: BackgroundTasks = BackgroundTasks(),
     current_user: UserInfo = Depends(get_current_user),
     _: None = Depends(RequirePermission("agent:deploy")),
-    cfn_service: CloudFormationDeploymentService = Depends(get_cloudformation_deployment_service)
+    deployment_service: DeploymentService = Depends(get_deployment_service)
 ):
     """
     Deploy CloudFormation stack for an agent asynchronously.
     
-    This endpoint initiates CloudFormation stack creation in the background
+    This endpoint initiates CloudFormation stack creation using DeploymentService
     and returns immediately, preventing health check failures.
     
     The UI workflow is:
@@ -339,7 +328,7 @@ async def create_agent(
         request: Request body with:
             - new_agent_name: Name of the agent to deploy
         background_tasks: FastAPI background tasks
-        cfn_service: Injected CloudFormation deployment service
+        deployment_service: Injected deployment service
         
     Returns:
         Immediate response with deployment initiation status
@@ -363,30 +352,13 @@ async def create_agent(
                 detail="Agent name must contain only letters, numbers, underscores, and hyphens"
             )
         
-        # Initiate CloudFormation stack creation (non-blocking)
-        cfn_parameters = {
-            "AgentName": agent_name
-        }
+        # Use consistent naming pattern throughout all operations
+        new_stack_name = deployment_service.get_stack_name_from_agent(agent_name)
         
-        stack_name = cfn_service._get_stack_name(agent_name)
-        
-        # Download template and create stack (fast operations)
-        template_body = cfn_service._download_template("GenericAgentTemplate.json")
-        cfn_parameters_list = cfn_service._convert_parameters(cfn_parameters)
-        
-        # Create stack (returns immediately, doesn't wait for completion)
-        cfn_service.cfn_client.create_stack(
-            StackName=stack_name,
-            TemplateBody=template_body,
-            Parameters=cfn_parameters_list,
-            Capabilities=['CAPABILITY_IAM', 'CAPABILITY_NAMED_IAM'],
-            Tags=[
-                {'Key': 'ManagedBy', 'Value': 'ConfigurationAPI'},
-                {'Key': 'ProjectName', 'Value': cfn_service.project_name},
-                {'Key': 'AgentName', 'Value': agent_name},
-                {'Key': 'DeployedAt', 'Value': datetime.now().isoformat()}
-            ],
-            TimeoutInMinutes=30
+        result = await deployment_service.create_agent_stack(
+            new_agent_name=agent_name,
+            new_stack_name=new_stack_name,
+            model_config=None  # Will read from SSM
         )
         
         logger.info(f"Stack creation initiated for agent: {agent_name}")
@@ -395,8 +367,8 @@ async def create_agent(
             "status": "initiated",
             "message": f"Infrastructure deployment initiated for agent '{agent_name}'",
             "agent_name": agent_name,
-            "stack_name": stack_name,
-            "deployment_status": "CREATE_IN_PROGRESS",
+            "stack_name": new_stack_name,
+            "deployment_status": result['status'],
             "outputs": {},
             "deployed_at": datetime.now().isoformat()
         }
@@ -404,11 +376,7 @@ async def create_agent(
     except ValueError as e:
         logger.error("Validation error creating agent")
         log_exception_safely(logger, "Validation error creating agent", e)
-        raise HTTPException(status_code=400, detail="Invalid agent creation parameters")
-    except RuntimeError as e:
-        logger.error("Runtime error creating agent")
-        log_exception_safely(logger, "Runtime error creating agent", e)
-        raise HTTPException(status_code=500, detail="Agent creation failed")
+        raise HTTPException(status_code=400, detail=str(e))
     except Exception as e:
         logger.error("Error creating agent")
         log_exception_safely(logger, "Error creating agent", e)
diff --git a/application_src/configuration-api/app/models/agent_config.py b/application_src/configuration-api/app/models/agent_config.py
index a6dd44b..cb61660 100644
--- a/application_src/configuration-api/app/models/agent_config.py
+++ b/application_src/configuration-api/app/models/agent_config.py
@@ -71,6 +71,10 @@ class AgentConfigRequest(BaseModel):
     
     # Model configuration
     model_id: str = Field(..., description="Primary model identifier")
+    model_ids: Optional[List[str]] = Field(
+        default=None,
+        description="Multiple model identifiers for dynamic model switching"
+    )
     judge_model_id: str = Field(..., description="Judge model identifier")
     embedding_model_id: str = Field(..., description="Embedding model identifier")
     region_name: str = Field(..., description="AWS region name")
@@ -162,6 +166,10 @@ class AgentConfigResponse(BaseModel):
     
     # Model configuration
     model_id: str = Field(..., description="Primary model identifier")
+    model_ids: Optional[List[str]] = Field(
+        default=None,
+        description="Multiple model identifiers for dynamic model switching"
+    )
     judge_model_id: str = Field(..., description="Judge model identifier")
     embedding_model_id: str = Field(..., description="Embedding model identifier")
     region_name: str = Field(..., description="AWS region name")
diff --git a/application_src/configuration-api/app/models/form_schema.py b/application_src/configuration-api/app/models/form_schema.py
index 619a589..f6fdb5a 100644
--- a/application_src/configuration-api/app/models/form_schema.py
+++ b/application_src/configuration-api/app/models/form_schema.py
@@ -722,25 +722,13 @@ def get_observability_schemas() -> Dict[str, ProviderFormSchema]:
                 provider_label="Langfuse",
                 description="Open-source LLM observability and analytics platform",
                 fields=[
-                    FormField(
-                        name="enabled",
-                        type=FieldType.CHECKBOX,
-                        label="Enable Observability Integration",
-                        help_text="Enable Langfuse observability and analytics",
-                        required=False,
-                        default_value=False
-                    ),
                     FormField(
                         name="public_key",
                         type=FieldType.TEXT,
                         label="Langfuse Public Key",
                         placeholder="Enter Langfuse public key",
                         help_text="Your Langfuse project public key",
-                        required=True,
-                        conditional={
-                            "field": "enabled",
-                            "value": True
-                        }
+                        required=True
                     ),
                     FormField(
                         name="secret_key",
@@ -749,11 +737,7 @@ def get_observability_schemas() -> Dict[str, ProviderFormSchema]:
                         placeholder="Enter Langfuse secret key",
                         help_text="Your Langfuse project secret key",
                         required=True,
-                        secure=True,
-                        conditional={
-                            "field": "enabled",
-                            "value": True
-                        }
+                        secure=True
                     ),
                     FormField(
                         name="host",
@@ -762,11 +746,7 @@ def get_observability_schemas() -> Dict[str, ProviderFormSchema]:
                         placeholder="https://cloud.langfuse.com",
                         help_text="Langfuse host URL (defaults to cloud.langfuse.com)",
                         required=False,
-                        default_value="https://cloud.langfuse.com",
-                        conditional={
-                            "field": "enabled",
-                            "value": True
-                        }
+                        default_value="https://cloud.langfuse.com"
                     )
                 ]
             ),
@@ -776,25 +756,13 @@ def get_observability_schemas() -> Dict[str, ProviderFormSchema]:
                 provider_label="Dynatrace",
                 description="Full-stack observability platform",
                 fields=[
-                    FormField(
-                        name="enabled",
-                        type=FieldType.CHECKBOX,
-                        label="Enable Observability Integration",
-                        help_text="Enable Dynatrace observability and monitoring",
-                        required=False,
-                        default_value=False
-                    ),
                     FormField(
                         name="environment_url",
                         type=FieldType.URL,
                         label="Dynatrace Environment URL",
                         placeholder="https://your-environment.live.dynatrace.com",
                         help_text="Your Dynatrace environment URL",
-                        required=True,
-                        conditional={
-                            "field": "enabled",
-                            "value": True
-                        }
+                        required=True
                     ),
                     FormField(
                         name="api_token",
@@ -803,11 +771,50 @@ def get_observability_schemas() -> Dict[str, ProviderFormSchema]:
                         placeholder="Enter Dynatrace API token",
                         help_text="API token with appropriate permissions",
                         required=True,
-                        secure=True,
-                        conditional={
-                            "field": "enabled",
-                            "value": True
-                        }
+                        secure=True
+                    )
+                ]
+            ),
+            
+            "elastic": ProviderFormSchema(
+                provider_name="elastic",
+                provider_label="Elastic Observability",
+                description="Elastic Cloud Managed OTLP Endpoint for OpenTelemetry-based observability",
+                fields=[
+                    FormField(
+                        name="otlp_endpoint",
+                        type=FieldType.URL,
+                        label="Elastic OTLP Endpoint",
+                        placeholder="https://your-cluster.elastic-cloud.com:443",
+                        help_text="Your Elastic Cloud Managed OTLP endpoint URL (found in Elastic Cloud console)",
+                        required=True
+                    ),
+                    FormField(
+                        name="api_key",
+                        type=FieldType.PASSWORD,
+                        label="Elastic API Key",
+                        placeholder="Enter Elastic API key",
+                        help_text="API key for authentication with Elastic Cloud",
+                        required=True,
+                        secure=True
+                    ),
+                    FormField(
+                        name="dataset",
+                        type=FieldType.TEXT,
+                        label="Data Stream Dataset (Optional)",
+                        placeholder="generic.otel",
+                        help_text="Dataset name for routing logs to dedicated data streams (default: generic.otel)",
+                        required=False,
+                        default_value="generic.otel"
+                    ),
+                    FormField(
+                        name="namespace",
+                        type=FieldType.TEXT,
+                        label="Data Stream Namespace (Optional)",
+                        placeholder="default",
+                        help_text="Namespace for data stream organization (default: default)",
+                        required=False,
+                        default_value="default"
                     )
                 ]
             )
diff --git a/application_src/configuration-api/app/models/ssm_data_models.py b/application_src/configuration-api/app/models/ssm_data_models.py
index 80b54fd..b4b32eb 100644
--- a/application_src/configuration-api/app/models/ssm_data_models.py
+++ b/application_src/configuration-api/app/models/ssm_data_models.py
@@ -34,6 +34,17 @@ class ProviderType(str, Enum):
     YES = "Yes"
     DEFAULT = "default"
     CUSTOM = "custom"
+    # Memory providers
+    MEM0 = "mem0"
+    ELASTICSEARCH = "elasticsearch"
+    BEDROCK_AGENTCORE = "bedrock_agentcore"
+    OPENSEARCH = "opensearch"
+    # Knowledge base providers
+    BEDROCK = "bedrock"
+    BEDROCK_KB = "bedrock_kb"
+    CUSTOM_KB = "custom_kb"
+    # Guardrail providers
+    BEDROCK_GUARDRAIL = "bedrock"  # NOTE(review): same value as BEDROCK, so Enum makes this an alias of BEDROCK (would raise under @unique) — confirm intended
 
 
 class StreamingType(str, Enum):
@@ -94,6 +105,10 @@ class SSMAgentConfiguration(BaseModel):
         ..., 
         description="Primary model identifier with region prefix (e.g. us.anthropic.claude-3-5-sonnet-20241022-v2:0)"
     )
+    model_ids: Optional[List[str]] = Field(
+        default=None,
+        description="Multiple model identifiers for dynamic model switching (optional, for multi-model support)"
+    )
     judge_model_id: str = Field(
         ..., 
         description="Judge model identifier with region prefix for evaluation tasks"
@@ -506,7 +521,7 @@ def validate_agent_configuration(config_data: Dict[str, Any]) -> Dict[str, Any]:
             
             return {
                 "valid": True,
-                "model": validated_config.dict(),
+                "model": validated_config.model_dump(mode='json'),
                 "errors": [],
                 "warnings": []
             }
diff --git a/application_src/configuration-api/app/services/agent_config_service.py b/application_src/configuration-api/app/services/agent_config_service.py
index 6911820..7e70931 100644
--- a/application_src/configuration-api/app/services/agent_config_service.py
+++ b/application_src/configuration-api/app/services/agent_config_service.py
@@ -37,6 +37,10 @@ def save_agent_configuration(self, config_request: AgentConfigRequest) -> Dict[s
         """
         Save agent configuration including system prompt and settings.
         
+        CRITICAL: This method preserves existing configuration data when editing.
+        When updating an existing agent, it merges new data with existing data to prevent
+        loss of configurations that weren't modified in the current request.
+        
         Args:
             config_request: Agent configuration request data
             
@@ -65,9 +69,109 @@ def save_agent_configuration(self, config_request: AgentConfigRequest) -> Dict[s
                 if not success:
                     raise Exception("Failed to store system prompt")
 
-            # Prepare configuration data (excluding system prompt content)
-            config_data = config_request.dict()
-            config_data.pop('system_prompt', None)  # Remove prompt content from config
+            # CRITICAL FIX: Load existing configuration to preserve data during edits
+            existing_config = self._get_agent_config(agent_name)
+            
+            # Prepare new configuration data (excluding system prompt content)
+            # Use model_dump() for Pydantic v2 with proper nested model serialization
+            new_config_data = config_request.model_dump(mode='json', exclude_none=True)
+            new_config_data.pop('system_prompt', None)  # Remove prompt content from config
+            
+            # Normalize cache_prompt and cache_tools values to valid enum values.
+            # The UI may send 'default', which is stored as the string 'False';
+            # both keys get the same treatment, so handle them in one loop.
+            for cache_key in ('cache_prompt', 'cache_tools'):
+                # .get() tolerates the key being absent (exclude_none=True above
+                # drops unset fields from the dump).
+                if new_config_data.get(cache_key) == 'default':
+                    new_config_data[cache_key] = 'False'
+                    logger.info(f"Normalized {cache_key} from 'default' to 'False'")
+
+            # CRITICAL FIX: Merge new data with existing data to preserve unmodified fields
+            if existing_config:
+                logger.info(f"Merging with existing configuration for agent: {agent_name}")
+                
+                # Start with existing config as base
+                merged_config = existing_config.copy()
+                
+                # Component types that have provider_details pattern
+                component_types = ['memory', 'knowledge_base', 'observability', 'guardrail']
+                
+                # Track which components have existing configurations
+                components_to_preserve = {}
+                for component_type in component_types:
+                    details_key = f"{component_type}_details" if component_type == 'knowledge_base' else f"{component_type}_provider_details"
+                    existing_details = existing_config.get(details_key, [])
+                    
+                    # Check if this component has meaningful existing configuration
+                    if isinstance(existing_details, list) and len(existing_details) > 0:
+                        # Check if any provider has non-empty config
+                        has_config = any(
+                            isinstance(provider, dict) and 
+                            isinstance(provider.get('config', {}), dict) and 
+                            len(provider.get('config', {})) > 0 
+                            for provider in existing_details
+                        )
+                        
+                        if has_config:
+                            components_to_preserve[component_type] = {
+                                'details_key': details_key,
+                                'enabled_key': component_type,
+                                'provider_key': f"{component_type}_provider",
+                                'existing_details': existing_details,
+                                'existing_enabled': existing_config.get(component_type, 'False'),
+                                'existing_provider': existing_config.get(f"{component_type}_provider", 'default')
+                            }
+                            logger.info(f"Component '{component_type}' has existing configuration to potentially preserve")
+                
+                # Update with new values
+                for key, value in new_config_data.items():
+                    # Check if this is a component field that should be preserved
+                    should_preserve = False
+                    
+                    for component_type, preserve_info in components_to_preserve.items():
+                        # Check if this key belongs to a component with existing config
+                        if key == preserve_info['details_key']:
+                            # If new value is empty list, preserve existing details
+                            if isinstance(value, list) and len(value) == 0:
+                                logger.info(f"Preserving existing {key} (has {len(preserve_info['existing_details'])} items)")
+                                should_preserve = True
+                                break
+                        
+                        # Also preserve the enabled flag if details are being preserved
+                        elif key == preserve_info['enabled_key']:
+                            new_details = new_config_data.get(preserve_info['details_key'], [])
+                            if isinstance(new_details, list) and len(new_details) == 0:
+                                # New request has empty details, check if we should preserve
+                                if value in ['False', 'false', False, 'No', 'default']:
+                                    logger.info(f"Preserving existing {key} enabled status: {preserve_info['existing_enabled']}")
+                                    merged_config[key] = preserve_info['existing_enabled']
+                                    should_preserve = True
+                                    break
+                        
+                        # Also preserve the provider name if details are being preserved
+                        elif key == preserve_info['provider_key']:
+                            new_details = new_config_data.get(preserve_info['details_key'], [])
+                            if isinstance(new_details, list) and len(new_details) == 0:
+                                if value in ['default', 'No', 'no']:
+                                    logger.info(f"Preserving existing {key}: {preserve_info['existing_provider']}")
+                                    merged_config[key] = preserve_info['existing_provider']
+                                    should_preserve = True
+                                    break
+                    
+                    # If we should preserve this field, skip the update
+                    if should_preserve:
+                        continue
+                    
+                    # Otherwise, update the field with new value
+                    merged_config[key] = value
+                
+                config_data = merged_config
+                logger.info(f"Configuration merged successfully, preserved {len(components_to_preserve)} component configurations")
+            else:
+                # New agent - use new config as-is
+                config_data = new_config_data
+                logger.info(f"Creating new configuration for agent: {agent_name}")
 
             # Store main configuration
             success = self._store_agent_config(agent_name, config_data)
@@ -112,10 +216,8 @@ def load_agent_configuration(self, agent_name: str) -> AgentConfigResponse:
             if config_data is None:
                 raise Exception(f"Agent '{agent_name}' not found")
 
-            logger.info(f"DEBUG CONFIG LOAD: Raw config data from SSM: {config_data}")
-            logger.info(f"DEBUG CONFIG LOAD: Data types in SSM config:")
-            for key, value in config_data.items():
-                logger.info(f"  {key}: {type(value).__name__} = {value}")
+            logger.debug("Raw config data loaded from SSM")
+            logger.debug(f"Data types in SSM config: {list(config_data.keys())}")
 
             # Retrieve system prompt if specified
             system_prompt_name = config_data.get('system_prompt_name', '')
@@ -136,9 +238,116 @@ def load_agent_configuration(self, agent_name: str) -> AgentConfigResponse:
             if 'mcp_servers' not in config_data:
                 config_data['mcp_servers'] = ""
 
-            logger.info(f"DEBUG CONFIG LOAD: Final config data before Pydantic model:")
-            for key, value in config_data.items():
-                logger.info(f"  {key}: {type(value).__name__} = {value}")
+            # Convert nested dictionaries to Pydantic models
+            # This ensures proper deserialization of complex nested structures
+            from ..models.agent_config import ThinkingConfig, ProviderConfig, ToolConfig
+            
+            # Helper function to safely convert nested structures
+            def convert_to_provider_config(item):
+                """Safely convert item to ProviderConfig, handling various input types."""
+                if isinstance(item, dict):
+                    return ProviderConfig(**item)
+                elif isinstance(item, ProviderConfig):
+                    return item
+                else:
+                    logger.warning(f"Unexpected provider config type: {type(item)}, item: {item}")
+                    # Try to handle string or other types gracefully
+                    if isinstance(item, str):
+                        return ProviderConfig(name=item, config={})
+                    return item
+            
+            def convert_to_tool_config(item):
+                """Safely convert item to ToolConfig, handling various input types."""
+                if isinstance(item, dict):
+                    return ToolConfig(**item)
+                elif isinstance(item, ToolConfig):
+                    return item
+                else:
+                    logger.warning(f"Unexpected tool config type: {type(item)}, item: {item}")
+                    if isinstance(item, str):
+                        return ToolConfig(name=item, config={})
+                    return item
+            
+            # Convert thinking config - handle missing or invalid data
+            if 'thinking' in config_data:
+                if isinstance(config_data['thinking'], dict):
+                    config_data['thinking'] = ThinkingConfig(**config_data['thinking'])
+                elif not isinstance(config_data['thinking'], ThinkingConfig):
+                    logger.warning(f"Invalid thinking config type: {type(config_data['thinking'])}, using default")
+                    config_data['thinking'] = ThinkingConfig(type="standard", budget_tokens=100000)
+            else:
+                config_data['thinking'] = ThinkingConfig(type="standard", budget_tokens=100000)
+            
+            # Convert tools list - handle empty, None, or invalid data
+            if 'tools' in config_data:
+                if isinstance(config_data['tools'], list):
+                    config_data['tools'] = [convert_to_tool_config(tool) for tool in config_data['tools']]
+                elif config_data['tools'] is None:
+                    config_data['tools'] = []
+                else:
+                    logger.warning(f"Invalid tools type: {type(config_data['tools'])}, using empty list")
+                    config_data['tools'] = []
+            else:
+                config_data['tools'] = []
+            
+            # Convert memory provider details - handle all cases
+            if 'memory_provider_details' in config_data:
+                if isinstance(config_data['memory_provider_details'], list):
+                    config_data['memory_provider_details'] = [
+                        convert_to_provider_config(provider) for provider in config_data['memory_provider_details']
+                    ]
+                elif config_data['memory_provider_details'] is None:
+                    config_data['memory_provider_details'] = []
+                else:
+                    logger.warning(f"Invalid memory_provider_details type: {type(config_data['memory_provider_details'])}")
+                    config_data['memory_provider_details'] = []
+            else:
+                config_data['memory_provider_details'] = []
+            
+            # Convert knowledge base details - handle all cases
+            if 'knowledge_base_details' in config_data:
+                if isinstance(config_data['knowledge_base_details'], list):
+                    config_data['knowledge_base_details'] = [
+                        convert_to_provider_config(provider) for provider in config_data['knowledge_base_details']
+                    ]
+                elif config_data['knowledge_base_details'] is None:
+                    config_data['knowledge_base_details'] = []
+                else:
+                    logger.warning(f"Invalid knowledge_base_details type: {type(config_data['knowledge_base_details'])}")
+                    config_data['knowledge_base_details'] = []
+            else:
+                config_data['knowledge_base_details'] = []
+            
+            # Convert observability provider details - handle all cases
+            if 'observability_provider_details' in config_data:
+                if isinstance(config_data['observability_provider_details'], list):
+                    config_data['observability_provider_details'] = [
+                        convert_to_provider_config(provider) for provider in config_data['observability_provider_details']
+                    ]
+                elif config_data['observability_provider_details'] is None:
+                    config_data['observability_provider_details'] = []
+                else:
+                    logger.warning(f"Invalid observability_provider_details type: {type(config_data['observability_provider_details'])}")
+                    config_data['observability_provider_details'] = []
+            else:
+                config_data['observability_provider_details'] = []
+            
+            # Convert guardrail provider details - handle all cases
+            if 'guardrail_provider_details' in config_data:
+                if isinstance(config_data['guardrail_provider_details'], list):
+                    config_data['guardrail_provider_details'] = [
+                        convert_to_provider_config(provider) for provider in config_data['guardrail_provider_details']
+                    ]
+                elif config_data['guardrail_provider_details'] is None:
+                    config_data['guardrail_provider_details'] = []
+                else:
+                    logger.warning(f"Invalid guardrail_provider_details type: {type(config_data['guardrail_provider_details'])}")
+                    config_data['guardrail_provider_details'] = []
+            else:
+                config_data['guardrail_provider_details'] = []
+
+            logger.debug("Final config data converted to Pydantic models")
+            logger.debug(f"Available config fields: {list(config_data.keys())}")
 
             logger.info(f"Successfully loaded configuration for agent: {agent_name}")
             return AgentConfigResponse(**config_data)
@@ -461,7 +670,7 @@ def _store_agent_config(self, agent_name: str, config_data: Dict) -> bool:
                 # Don't fail silently - log the error but still attempt to store for backward compatibility
                 logger.warning(f"Storing potentially incomplete configuration for {agent_name}")
             else:
-                logger.info(f"✅ Configuration validation passed for {agent_name} - conforms to SSM data model")
+                logger.info(f"Configuration validation passed for {agent_name} - conforms to SSM data model")
             
             # Use standardized path from SSM parameter paths
             config_path = SSMParameterPaths.agent_config(agent_name)
@@ -877,7 +1086,8 @@ def update_agent_tools(self, tools_request: AgentToolsUpdateRequest) -> Dict[str
                 raise Exception(f"Agent '{agent_name}' configuration not found")
 
             # Update only the tools field
-            current_config['tools'] = [tool.dict() for tool in tools_request.tools]
+            # Use model_dump() for Pydantic v2 with proper nested model serialization
+            current_config['tools'] = [tool.model_dump(mode='json') for tool in tools_request.tools]
 
             # Store updated configuration
             success = self._store_agent_config(agent_name, current_config)
diff --git a/application_src/configuration-api/app/services/cloudformation_deployment_service.py b/application_src/configuration-api/app/services/cloudformation_deployment_service.py
deleted file mode 100644
index e08d15f..0000000
--- a/application_src/configuration-api/app/services/cloudformation_deployment_service.py
+++ /dev/null
@@ -1,514 +0,0 @@
-"""
-CloudFormation Deployment Service
-
-Manages dynamic agent stack deployment using CloudFormation API directly.
-Replaces subprocess-based CDK deployment with native CloudFormation operations.
-
-Key Features:
-- Downloads CloudFormation templates from S3
-- Creates/updates/deletes agent stacks with proper tagging
-- Monitors deployment status with comprehensive error handling
-- Provides deployment metadata and outputs
-"""
-
-import logging
-import time
-import json
-from typing import Any
-from datetime import datetime, timezone
-
-import sys
-import os
-sys.path.append(os.path.join(os.path.dirname(__file__), '../../../../'))
-from common.secure_logging_utils import log_exception_safely
-
-import boto3
-from botocore.exceptions import ClientError
-
-logger = logging.getLogger(__name__)
-
-
-class CloudFormationDeploymentService:
-    """Service for managing CloudFormation stack deployments."""
-    
-    def __init__(
-        self,
-        region: str,
-        project_name: str,
-        template_bucket: str
-    ):
-        """
-        Initialize CloudFormation deployment service.
-        
-        Args:
-            region: AWS region for deployments
-            project_name: Project name for stack naming
-            template_bucket: S3 bucket containing CloudFormation templates
-        """
-        self.region = region
-        self.project_name = project_name
-        self.template_bucket = template_bucket
-        
-        # Initialize AWS clients
-        self.cfn_client = boto3.client('cloudformation', region_name=region)
-        self.s3_client = boto3.client('s3', region_name=region)
-        
-        logger.info(
-            f"Initialized CloudFormation service: region={region}, "
-            f"project={project_name}, bucket={template_bucket}"
-        )
-    
-    def deploy_agent_stack(
-        self,
-        agent_name: str,
-        parameters: dict[str, Any],
-        timeout_minutes: int = 30
-    ) -> dict[str, Any]:
-        """
-        Deploy a new agent stack using CloudFormation.
-        
-        Args:
-            agent_name: Name of the agent
-            parameters: CloudFormation parameters for the stack
-            timeout_minutes: Maximum time to wait for deployment
-            
-        Returns:
-            Deployment result with stack outputs and metadata
-            
-        Raises:
-            RuntimeError: If deployment fails
-        """
-        stack_name = self._get_stack_name(agent_name)
-        template_key = "GenericAgentTemplate.json"
-        
-        logger.info(f"Starting deployment of agent stack: {stack_name}")
-        
-        try:
-            # Download template from S3
-            template_body = self._download_template(template_key)
-            
-            # Prepare CloudFormation parameters
-            cfn_parameters = self._convert_parameters(parameters)
-            
-            # Create stack with proper tags
-            self.cfn_client.create_stack(
-                StackName=stack_name,
-                TemplateBody=template_body,
-                Parameters=cfn_parameters,
-                Capabilities=['CAPABILITY_IAM', 'CAPABILITY_NAMED_IAM'],
-                Tags=[
-                    {'Key': 'ManagedBy', 'Value': 'ConfigurationAPI'},
-                    {'Key': 'ProjectName', 'Value': self.project_name},
-                    {'Key': 'AgentName', 'Value': agent_name},
-                    {'Key': 'DeployedAt', 'Value': datetime.now(timezone.utc).isoformat()}
-                ],
-                TimeoutInMinutes=timeout_minutes
-            )
-            
-            logger.info(f"CloudFormation stack creation initiated: {stack_name}")
-            
-            # Wait for stack creation to complete
-            result = self._wait_for_stack_complete(stack_name, 'CREATE_COMPLETE', timeout_minutes)
-            
-            logger.info(f"Agent stack deployed successfully: {stack_name}")
-            return result
-            
-        except ClientError as e:
-            log_exception_safely(logger, e, f"Failed to deploy agent stack {stack_name}")
-            raise RuntimeError(f"Failed to deploy agent stack {stack_name}") from e
-    
-    def update_agent_stack(
-        self,
-        agent_name: str,
-        parameters: dict[str, Any],
-        timeout_minutes: int = 30
-    ) -> dict[str, Any]:
-        """
-        Update an existing agent stack.
-        
-        Args:
-            agent_name: Name of the agent
-            parameters: CloudFormation parameters for the stack
-            timeout_minutes: Maximum time to wait for update
-            
-        Returns:
-            Update result with stack outputs and metadata
-            
-        Raises:
-            RuntimeError: If update fails
-        """
-        stack_name = self._get_stack_name(agent_name)
-        template_key = "GenericAgentTemplate.json"
-        
-        logger.info(f"Starting update of agent stack: {stack_name}")
-        
-        try:
-            # Download template from S3
-            template_body = self._download_template(template_key)
-            
-            # Prepare CloudFormation parameters
-            cfn_parameters = self._convert_parameters(parameters)
-            
-            # Update stack
-            self.cfn_client.update_stack(
-                StackName=stack_name,
-                TemplateBody=template_body,
-                Parameters=cfn_parameters,
-                Capabilities=['CAPABILITY_IAM', 'CAPABILITY_NAMED_IAM']
-            )
-            
-            logger.info(f"CloudFormation stack update initiated: {stack_name}")
-            
-            # Wait for stack update to complete
-            result = self._wait_for_stack_complete(stack_name, 'UPDATE_COMPLETE', timeout_minutes)
-            
-            logger.info(f"Agent stack updated successfully: {stack_name}")
-            return result
-            
-        except ClientError as e:
-            if 'No updates are to be performed' in str(e):
-                logger.info(f"No updates needed for stack: {stack_name}")
-                return self.get_stack_info(agent_name)
-            
-            log_exception_safely(logger, e, f"Failed to update agent stack {stack_name}")
-            raise RuntimeError(f"Failed to update agent stack {stack_name}") from e
-    
-    def delete_agent_stack(
-        self,
-        agent_name: str,
-        timeout_minutes: int = 30
-    ) -> dict[str, Any]:
-        """
-        Delete an agent stack.
-        
-        Args:
-            agent_name: Name of the agent
-            timeout_minutes: Maximum time to wait for deletion
-            
-        Returns:
-            Deletion result with metadata
-            
-        Raises:
-            RuntimeError: If deletion fails
-        """
-        stack_name = self._get_stack_name(agent_name)
-        
-        logger.info(f"Starting deletion of agent stack: {stack_name}")
-        
-        try:
-            # Delete stack
-            self.cfn_client.delete_stack(StackName=stack_name)
-            
-            logger.info(f"CloudFormation stack deletion initiated: {stack_name}")
-            
-            # Wait for stack deletion to complete
-            self._wait_for_stack_delete(stack_name, timeout_minutes)
-            
-            logger.info(f"Agent stack deleted successfully: {stack_name}")
-            
-            return {
-                'stack_name': stack_name,
-                'agent_name': agent_name,
-                'status': 'DELETE_COMPLETE',
-                'deleted_at': datetime.now(timezone.utc).isoformat()
-            }
-            
-        except ClientError as e:
-            log_exception_safely(logger, e, f"Failed to delete agent stack {stack_name}")
-            raise RuntimeError(f"Failed to delete agent stack {stack_name}") from e
-    
-    def get_stack_info(self, agent_name: str) -> dict[str, Any]:
-        """
-        Get information about an agent stack.
-        
-        Args:
-            agent_name: Name of the agent
-            
-        Returns:
-            Stack information with outputs and metadata
-            
-        Raises:
-            RuntimeError: If stack not found or error occurs
-        """
-        stack_name = self._get_stack_name(agent_name)
-        
-        try:
-            response = self.cfn_client.describe_stacks(StackName=stack_name)
-            
-            if not response.get('Stacks'):
-                raise RuntimeError(f"Stack not found: {stack_name}")
-            
-            stack = response['Stacks'][0]
-            
-            return {
-                'stack_name': stack_name,
-                'agent_name': agent_name,
-                'stack_id': stack.get('StackId'),
-                'status': stack.get('StackStatus'),
-                'creation_time': stack.get('CreationTime').isoformat() if stack.get('CreationTime') else None,
-                'last_updated_time': stack.get('LastUpdatedTime').isoformat() if stack.get('LastUpdatedTime') else None,
-                'outputs': self._parse_stack_outputs(stack.get('Outputs', []))
-            }
-            
-        except ClientError as e:
-            if e.response['Error']['Code'] == 'ValidationError':
-                raise RuntimeError(f"Stack not found: {stack_name}") from e
-            
-            log_exception_safely(logger, e, f"Failed to get stack info for {stack_name}")
-            raise RuntimeError(f"Failed to get stack info for {stack_name}") from e
-    
-    def list_agent_stacks(self) -> list[dict[str, Any]]:
-        """
-        List all agent stacks managed by this service.
-        
-        Returns:
-            List of stack summaries
-        """
-        try:
-            stacks = []
-            paginator = self.cfn_client.get_paginator('list_stacks')
-            
-            for page in paginator.paginate(
-                StackStatusFilter=[
-                    'CREATE_COMPLETE',
-                    'UPDATE_COMPLETE',
-                    'UPDATE_ROLLBACK_COMPLETE'
-                ]
-            ):
-                for stack in page['StackSummaries']:
-                    stack_name = stack['StackName']
-                    
-                    # Only include stacks managed by this project
-                    if stack_name.startswith(f"{self.project_name}-agent-"):
-                        stacks.append({
-                            'stack_name': stack_name,
-                            'agent_name': self._extract_agent_name(stack_name),
-                            'status': stack['StackStatus'],
-                            'creation_time': stack['CreationTime'].isoformat() if stack.get('CreationTime') else None,
-                            'last_updated_time': stack.get('LastUpdatedTime').isoformat() if stack.get('LastUpdatedTime') else None
-                        })
-            
-            return stacks
-            
-        except ClientError as e:
-            log_exception_safely(logger, e, "Failed to list agent stacks")
-            raise RuntimeError("Failed to list agent stacks") from e
-    
-    def _get_stack_name(self, agent_name: str) -> str:
-        """Generate CloudFormation stack name for agent."""
-        return f"{self.project_name}-agent-{agent_name}"
-    
-    def _extract_agent_name(self, stack_name: str) -> str:
-        """Extract agent name from stack name."""
-        prefix = f"{self.project_name}-agent-"
-        if stack_name.startswith(prefix):
-            return stack_name[len(prefix):]
-        return stack_name
-    
-    def _download_template(self, template_key: str) -> str:
-        """
-        Download CloudFormation template from S3.
-        
-        Args:
-            template_key: S3 object key for template
-            
-        Returns:
-            Template body as string
-        """
-        try:
-            response = self.s3_client.get_object(
-                Bucket=self.template_bucket,
-                Key=template_key
-            )
-            
-            template_body = response['Body'].read().decode('utf-8')
-            logger.info(f"Downloaded template: {template_key}")
-            
-            return template_body
-            
-        except ClientError as e:
-            log_exception_safely(logger, e, f"Failed to download template {template_key}")
-            raise RuntimeError(f"Failed to download template {template_key}") from e
-    
-    def _convert_parameters(self, parameters: dict[str, Any]) -> list[dict[str, str]]:
-        """
-        Convert dictionary parameters to CloudFormation format.
-        
-        Args:
-            parameters: Dictionary of parameter name -> value
-            
-        Returns:
-            List of CloudFormation parameter dicts
-        """
-        cfn_parameters = []
-        
-        for key, value in parameters.items():
-            cfn_parameters.append({
-                'ParameterKey': key,
-                'ParameterValue': str(value)
-            })
-        
-        return cfn_parameters
-    
-    def _parse_stack_outputs(self, outputs: list[dict[str, str]]) -> dict[str, str]:
-        """
-        Parse CloudFormation stack outputs into a dictionary.
-        
-        Args:
-            outputs: List of output dicts from CloudFormation
-            
-        Returns:
-            Dictionary of output key -> value
-        """
-        return {
-            output['OutputKey']: output['OutputValue']
-            for output in outputs
-        }
-    
-    def _wait_for_stack_complete(
-        self,
-        stack_name: str,
-        expected_status: str,
-        timeout_minutes: int
-    ) -> dict[str, Any]:
-        """
-        Wait for stack operation to complete.
-        
-        Args:
-            stack_name: Name of the stack
-            expected_status: Expected completion status
-            timeout_minutes: Maximum time to wait
-            
-        Returns:
-            Stack information after completion
-            
-        Raises:
-            RuntimeError: If operation fails or times out
-        """
-        start_time = time.time()
-        timeout_seconds = timeout_minutes * 60
-        check_interval = 10  # seconds
-        
-        while True:
-            elapsed = time.time() - start_time
-            
-            if elapsed > timeout_seconds:
-                raise RuntimeError(
-                    f"Stack operation timed out after {timeout_minutes} minutes: {stack_name}"
-                )
-            
-            try:
-                response = self.cfn_client.describe_stacks(StackName=stack_name)
-                
-                if not response.get('Stacks'):
-                    raise RuntimeError(f"Stack not found: {stack_name}")
-                
-                stack = response['Stacks'][0]
-                status = stack['StackStatus']
-                
-                logger.info(f"Stack {stack_name} status: {status}")
-                
-                # Check if completed successfully
-                if status == expected_status:
-                    return {
-                        'stack_name': stack_name,
-                        'agent_name': self._extract_agent_name(stack_name),
-                        'status': status,
-                        'outputs': self._parse_stack_outputs(stack.get('Outputs', []))
-                    }
-                
-                # Check for failure states (including ROLLBACK_COMPLETE)
-                if (status.endswith('_FAILED') or 
-                    status == 'ROLLBACK_COMPLETE' or 
-                    status == 'ROLLBACK_FAILED' or
-                    status == 'DELETE_FAILED'):
-                    error_msg = self._get_stack_error_reason(stack_name)
-                    logger.error(f"Stack {stack_name} failed with status {status}: {error_msg}")
-                    raise RuntimeError(
-                        f"Stack operation failed with status {status}: {error_msg}"
-                    )
-                
-                # Still in progress, wait before checking again
-                time.sleep(check_interval)
-                
-            except ClientError as e:
-                log_exception_safely(logger, e, "Error checking stack status")
-                raise RuntimeError("Error checking stack status") from e
-    
-    def _wait_for_stack_delete(self, stack_name: str, timeout_minutes: int) -> None:
-        """
-        Wait for stack deletion to complete.
-        
-        Args:
-            stack_name: Name of the stack
-            timeout_minutes: Maximum time to wait
-            
-        Raises:
-            RuntimeError: If deletion fails or times out
-        """
-        start_time = time.time()
-        timeout_seconds = timeout_minutes * 60
-        check_interval = 10  # seconds
-        
-        while True:
-            elapsed = time.time() - start_time
-            
-            if elapsed > timeout_seconds:
-                raise RuntimeError(
-                    f"Stack deletion timed out after {timeout_minutes} minutes: {stack_name}"
-                )
-            
-            try:
-                response = self.cfn_client.describe_stacks(StackName=stack_name)
-                
-                if not response.get('Stacks'):
-                    # Stack no longer exists - deletion complete
-                    logger.info(f"Stack deleted successfully: {stack_name}")
-                    return
-                
-                stack = response['Stacks'][0]
-                status = stack['StackStatus']
-                
-                logger.info(f"Stack {stack_name} deletion status: {status}")
-                
-                # Check for failure states
-                if status == 'DELETE_FAILED':
-                    error_msg = self._get_stack_error_reason(stack_name)
-                    raise RuntimeError(
-                        f"Stack deletion failed: {error_msg}"
-                    )
-                
-                # Still deleting, wait before checking again
-                time.sleep(check_interval)
-                
-            except ClientError as e:
-                if e.response['Error']['Code'] == 'ValidationError':
-                    # Stack no longer exists - deletion complete
-                    logger.info(f"Stack deleted successfully: {stack_name}")
-                    return
-                
-                log_exception_safely(logger, e, "Error checking stack deletion status")
-                raise RuntimeError("Error checking stack deletion status") from e
-    
-    def _get_stack_error_reason(self, stack_name: str) -> str:
-        """
-        Get detailed error reason for stack operation failure.
-        
-        Args:
-            stack_name: Name of the stack
-            
-        Returns:
-            Error message describing the failure
-        """
-        try:
-            response = self.cfn_client.describe_stack_events(StackName=stack_name)
-            
-            # Find failed events
-            for event in response['StackEvents']:
-                if event['ResourceStatus'].endswith('_FAILED'):
-                    return event.get('ResourceStatusReason', 'Unknown error')
-            
-            return "No detailed error information available"
-            
-        except ClientError:
-            return "Failed to retrieve error details"
diff --git a/application_src/configuration-api/app/services/deployment_service.py b/application_src/configuration-api/app/services/deployment_service.py
index d694e81..9eb3bd3 100644
--- a/application_src/configuration-api/app/services/deployment_service.py
+++ b/application_src/configuration-api/app/services/deployment_service.py
@@ -39,9 +39,16 @@ def __init__(self):
             # Get region from environment variable with fallback
             self.region = os.environ.get('AWS_REGION', os.environ.get('AWS_DEFAULT_REGION', 'us-east-1'))
             self.cloudformation = boto3.client('cloudformation', region_name=self.region)
-            # Get project name from environment or use default
-            self.project_name = os.environ.get('PROJECT_NAME', 'genai-box')
+            self.s3 = boto3.client('s3', region_name=self.region)
+            
+            # Get project name and account for S3 bucket construction
+            self.project_name = os.environ.get('PROJECT_NAME', 'ai-platform')
+            sts = boto3.client('sts')
+            self.account_id = sts.get_caller_identity()['Account']
+            self.template_bucket_name = f"{self.project_name}-templates-{self.account_id}-{self.region}"
+            
             logger.info(f"DeploymentService initialized for region: {self.region}, project: {self.project_name}")
+            logger.info(f"Template bucket: {self.template_bucket_name}")
         except NoCredentialsError:
             logger.error("AWS credentials not found")
             raise ValueError("AWS credentials not configured")
@@ -60,16 +67,14 @@ async def get_project_name(self) -> str:
     
     async def create_agent_stack(
         self,
-        source_stack_name: str,
         new_agent_name: str,
         new_stack_name: str,
         model_config: Optional[Dict[str, Any]] = None
     ) -> Dict[str, Any]:
         """
-        Create a new agent stack from an existing template.
+        Create a new agent stack using the template from S3.
         
         Args:
-            source_stack_name: Name of the existing stack to copy template from
             new_agent_name: Name for the new agent (AgentName parameter)
             new_stack_name: Name for the new CloudFormation stack
             model_config: Model configuration (if not provided, will read from SSM)
@@ -78,58 +83,119 @@ async def create_agent_stack(
             Dictionary containing stack creation information
             
         Raises:
-            ValueError: If source stack not found or validation fails
+            ValueError: If template not found or validation fails
             Exception: If stack creation fails
         """
         try:
-            logger.info(f"Creating agent stack '{new_stack_name}' from source '{source_stack_name}'")
+            logger.info(f"Creating agent stack '{new_stack_name}' for agent '{new_agent_name}'")
             
             # Simplified approach: Agents read all configuration from SSM
             # No need to inject environment variables - cleaner and more reliable
             
-            # Get the template from the source stack
-            template_body = await self._get_stack_template(source_stack_name)
+            # Get the template from S3 (deployed by CDK template-storage stack)
+            template_body = await self._get_template_from_s3("GenericAgentTemplate.json")
             
-            # Only modify the AgentName parameter - agents read everything else from SSM
-            modified_template = self._modify_agent_name_parameter(template_body, new_agent_name)
+            # No template modification needed - parameters are explicitly required
+            # AgentName and ImageTag must be provided as CloudFormation parameters
             
             logger.info(f"Agent {new_agent_name} will read all configuration from SSM parameter: /agent/{new_agent_name}/config")
             
-            # Get the original stack's tags and capabilities
-            source_stack_info = await self._get_stack_info(source_stack_name)
-            
             # Prepare parameters for the new stack
+            logger.debug(f"Building CloudFormation parameters for agent: {new_agent_name}")
+            
             parameters = [
                 {
                     'ParameterKey': 'AgentName',
                     'ParameterValue': new_agent_name
                 }
             ]
+            logger.debug(f"Set AgentName parameter: {new_agent_name}")
+            
+            # CRITICAL: Retrieve and set ImageTag from SSM to ensure correct image version
+            # ImageTag parameter is required - no default value in template
+            logger.info("IMAGE TAG RETRIEVAL FOR AGENT CREATION")
+            
+            image_uri = None
+            try:
+                logger.info(f"Retrieving image URI from SSM Parameter Store: /{self.project_name}/agent/image-uri")
+                
+                image_uri = self._get_image_uri_from_ssm()
+                
+                logger.info(f"Successfully retrieved image URI from SSM: {image_uri}")
+                
+                # Extract and log the tag portion
+                if ':' in image_uri:
+                    tag_portion = image_uri.split(':')[-1]
+                    logger.debug(f"Extracted tag: {tag_portion}")
+                    
+                    # Validate that we don't have "latest" in the image URI
+                    if tag_portion.lower() == 'latest':
+                        logger.error("SSM parameter contains 'latest' tag!")
+                        logger.error("This indicates the ECR image was not properly tagged with SHA256")
+                        logger.error("The CDK deployment may not have completed successfully")
+                        raise ValueError("SSM parameter contains 'latest' tag instead of SHA256 hash")
+                else:
+                    logger.error("No ':' found in image URI from SSM")
+                    logger.error(f"Invalid image URI format: {image_uri}")
+                    raise ValueError("Invalid image URI format - missing tag separator")
+                
+                # IMPORTANT: Pass the FULL image URI, not just the tag
+                # The CloudFormation template expects the complete URI with repository and SHA256 tag
+                parameters.append({
+                    'ParameterKey': 'ImageTag',
+                    'ParameterValue': image_uri
+                })
+                
+                logger.info("Successfully added ImageTag to CloudFormation parameters")
+                logger.debug(f"CloudFormation ImageTag parameter value: {image_uri}")
+                
+            except Exception as e:
+                logger.error(f"Failed to retrieve ImageTag from SSM: {e}")
+                logger.error(f"Error type: {type(e).__name__}")
+                
+                # CRITICAL: Do not proceed without a valid ImageTag - this would cause "latest" to be used
+                if image_uri and 'latest' in image_uri.lower():
+                    logger.error("Refusing to create agent with 'latest' image tag!")
+                    logger.error("This would create an unstable deployment")
+                    raise ValueError("Cannot create agent with 'latest' image tag - please ensure CDK deployment completed successfully")
+                
+                logger.error("Refusing to create agent WITHOUT ImageTag parameter")
+                logger.error("ImageTag parameter is required - no default value in template")
+                logger.error("Agent creation aborted to prevent deployment failure!")
+                
+                # Re-raise the exception to prevent agent creation with wrong image
+                raise ValueError(f"ImageTag retrieval failed: {e}. Cannot create agent without proper image tag.")
+            
+            # Log final parameters before CloudFormation call
+            logger.info("Final CloudFormation parameters for create_stack")
+            logger.info(f"Stack Name: {new_stack_name}")
+            logger.info(f"Total parameters: {len(parameters)}")
+            for i, param in enumerate(parameters, 1):
+                logger.debug(f"  {i}. {param['ParameterKey']} = {param['ParameterValue'][:100]}...")  # Truncate long values
             
             # Create the new stack
             create_params = {
                 'StackName': new_stack_name,
-                'TemplateBody': json.dumps(modified_template),
+                'TemplateBody': json.dumps(template_body),
                 'Parameters': parameters,
                 'Capabilities': ['CAPABILITY_IAM', 'CAPABILITY_NAMED_IAM'],
                 'Tags': [
                     {'Key': 'ManagedBy', 'Value': 'ConfigurationAPI'},
                     {'Key': 'AgentName', 'Value': new_agent_name},
-                    {'Key': 'SourceStack', 'Value': source_stack_name},
+                    {'Key': 'TemplateSource', 'Value': 'S3'},
                     {'Key': 'CreatedAt', 'Value': datetime.utcnow().isoformat()}
                 ]
             }
             
-            # Add original tags (excluding system tags)
-            if source_stack_info.get('tags'):
-                for tag in source_stack_info['tags']:
-                    if not tag['Key'].startswith('aws:'):
-                        create_params['Tags'].append(tag)
-            
+            logger.info("Calling CloudFormation CreateStack API...")
             response = self.cloudformation.create_stack(**create_params)
             
             stack_id = response['StackId']
-            logger.info(f"Successfully created stack '{new_stack_name}' with ID: {stack_id}")
+            logger.info("Stack creation initiated successfully")
+            logger.info(f"Stack Name: {new_stack_name}")
+            logger.info(f"Stack ID: {stack_id}")
+            logger.info(f"Agent Name: {new_agent_name}")
+            logger.info(f"Status: CREATE_IN_PROGRESS")
             
             return {
                 'stack_name': new_stack_name,
@@ -183,33 +249,48 @@ async def _get_stack_template(self, stack_name: str) -> Dict[str, Any]:
             log_exception_safely(logger, e, "Error getting stack template")
             raise
     
-    def _modify_agent_name_parameter(self, template: Dict[str, Any], new_agent_name: str) -> Dict[str, Any]:
+    async def _get_template_from_s3(self, template_key: str = "GenericAgentTemplate.json") -> Dict[str, Any]:
         """
-        Modify the AgentName parameter default value in the template.
+        Get the latest CloudFormation template from S3.
+        
+        This fetches the template that was deployed by CDK to the template storage bucket.
+        This ensures stack updates pick up the latest CDK-generated template changes.
         
         Args:
-            template: Original CloudFormation template
-            new_agent_name: New agent name to set as default
+            template_key: S3 key for the template file
             
         Returns:
-            Modified CloudFormation template
+            CloudFormation template as a dictionary
+            
+        Raises:
+            ValueError: If template not found in S3
+            Exception: If unable to retrieve template
         """
         try:
-            # Create a copy of the template to avoid modifying the original
-            modified_template = json.loads(json.dumps(template))
+            logger.info(f"Retrieving template from S3: s3://{self.template_bucket_name}/{template_key}")
             
-            # Update the AgentName parameter default value
-            if 'Parameters' in modified_template and 'AgentName' in modified_template['Parameters']:
-                modified_template['Parameters']['AgentName']['Default'] = new_agent_name
-                logger.info(f"Updated AgentName parameter default to: {new_agent_name}")
-            else:
-                logger.warning("AgentName parameter not found in template")
+            response = self.s3.get_object(
+                Bucket=self.template_bucket_name,
+                Key=template_key
+            )
             
-            return modified_template
+            template_body = response['Body'].read().decode('utf-8')
+            template = json.loads(template_body)
             
+            logger.info(f"Successfully retrieved template from S3: {template_key}")
+            return template
+            
+        except ClientError as e:
+            error_code = e.response['Error']['Code']
+            if error_code in ['NoSuchKey', 'NoSuchBucket']:
+                raise ValueError(f"Template '{template_key}' not found in S3 bucket '{self.template_bucket_name}'")
+            else:
+                logger.error(f"S3 error retrieving template: {error_code}")
+                raise Exception(f"Failed to retrieve template from S3: {e.response['Error']['Message']}")
         except Exception as e:
-            log_exception_safely(logger, e, "Error modifying template")
-            raise RuntimeError("Failed to modify template")
+            log_exception_safely(logger, e, "Error getting template from S3")
+            raise
+    
     
     def _inject_model_environment_variables(
         self, 
@@ -354,14 +435,22 @@ async def get_stack_status(self, stack_name: str) -> Dict[str, Any]:
             for output in stack_info.get('outputs', []):
                 outputs[output['OutputKey']] = output['OutputValue']
             
+            # Convert datetime objects to ISO format strings for JSON serialization
+            creation_time = None
+            if stack_info.get('creation_time'):
+                creation_time = stack_info['creation_time'].isoformat() if hasattr(stack_info['creation_time'], 'isoformat') else str(stack_info['creation_time'])
+            
+            last_updated_time = None
+            if stack_info.get('last_updated_time'):
+                last_updated_time = stack_info['last_updated_time'].isoformat() if hasattr(stack_info['last_updated_time'], 'isoformat') else str(stack_info['last_updated_time'])
+            
             return {
                 'stack_name': stack_info['stack_name'],
                 'stack_id': stack_info['stack_id'],
                 'status': stack_info['status'],
-                'agent_name': agent_name,
-                'created_at': stack_info.get('creation_time'),
-                'updated_at': stack_info.get('last_updated_time'),
-                'outputs': outputs if outputs else None
+                'creation_time': creation_time,
+                'last_updated_time': last_updated_time,
+                'outputs': outputs if outputs else {}
             }
             
         except Exception as e:
@@ -451,14 +540,25 @@ async def list_agent_stacks(self) -> List[Dict[str, Any]]:
             log_exception_safely(logger, e, "Error listing agent stacks")
             raise
     
-    async def find_agent_stack_by_name(self, agent_name: str) -> Optional[Dict[str, Any]]:
+    def get_stack_name_from_agent(self, agent_name: str) -> str:
         """
-        Find a CloudFormation stack for a specific agent using multiple naming patterns.
+        Get CloudFormation stack name from agent name using consistent naming pattern.
         
-        This method checks multiple possible naming patterns:
-        1. ai-platform-agent-{agent-name} (actual pattern used by the system)
-        2. ai-platform-{agent-name}-stack (legacy pattern)
-        3. {project-name}-{agent-name}-stack (fallback pattern)
+        Standard pattern: {project_name}-agent-{agent_name}
+        
+        Args:
+            agent_name: Name of the agent
+            
+        Returns:
+            CloudFormation stack name
+        """
+        # Convert underscores to hyphens for stack naming consistency
+        stack_agent_name = agent_name.replace('_', '-')
+        return f"{self.project_name}-agent-{stack_agent_name}"
+    
+    async def find_agent_stack_by_name(self, agent_name: str) -> Optional[Dict[str, Any]]:
+        """
+        Find a CloudFormation stack for a specific agent using standard naming pattern.
         
         Args:
             agent_name: Name of the agent to find stack for
@@ -467,70 +567,272 @@ async def find_agent_stack_by_name(self, agent_name: str) -> Optional[Dict[str,
             Stack information if found, None otherwise
         """
         try:
-            # Convert agent name to the expected stack format
-            stack_agent_name = agent_name.replace('_', '-')
-            
-            # Try multiple naming patterns
-            possible_stack_names = [
-                f"ai-platform-agent-{stack_agent_name}",  # Actual pattern from AWS CLI output
-                f"{self.project_name}-agent-{stack_agent_name}",  # Project-based agent pattern
-                f"ai-platform-{stack_agent_name}-stack",  # Original expected pattern
-                f"{self.project_name}-{stack_agent_name}-stack"  # Project-based legacy pattern
-            ]
+            # Use consistent naming pattern
+            expected_stack_name = self.get_stack_name_from_agent(agent_name)
             
-            logger.info(f"Looking for agent stack using multiple patterns: {possible_stack_names}")
+            logger.info(f"Looking for agent stack: {expected_stack_name}")
             
-            for expected_stack_name in possible_stack_names:
-                try:
-                    logger.info(f"Trying stack name pattern: {expected_stack_name}")
-                    
-                    # Try to get the stack directly using this pattern
-                    stack_info = await self._get_stack_info(expected_stack_name)
+            try:
+                # Get the stack directly using standard pattern
+                stack_info = await self._get_stack_info(expected_stack_name)
+                
+                # Verify this stack has the correct AgentName parameter
+                stack_agent_name_param = None
+                for param in stack_info.get('parameters', []):
+                    if param['ParameterKey'] == 'AgentName':
+                        stack_agent_name_param = param['ParameterValue']
+                        break
+                
+                if stack_agent_name_param == agent_name:
+                    logger.info(f"Found agent stack: {expected_stack_name} with AgentName={agent_name}")
                     
-                    # Verify this stack has the correct AgentName parameter
-                    stack_agent_name_param = None
-                    for param in stack_info.get('parameters', []):
-                        if param['ParameterKey'] == 'AgentName':
-                            stack_agent_name_param = param['ParameterValue']
+                    # Check if it's managed by our API
+                    managed_by_api = False
+                    for tag in stack_info.get('tags', []):
+                        if tag['Key'] == 'ManagedBy' and tag['Value'] == 'ConfigurationAPI':
+                            managed_by_api = True
                             break
                     
-                    if stack_agent_name_param == agent_name:
-                        logger.info(f"✅ Found exact match: {expected_stack_name} with AgentName={agent_name}")
-                        
-                        # Check if it's managed by our API
-                        managed_by_api = False
-                        for tag in stack_info.get('tags', []):
-                            if tag['Key'] == 'ManagedBy' and tag['Value'] == 'ConfigurationAPI':
-                                managed_by_api = True
-                                break
-                        
-                        return {
-                            'stack_name': stack_info['stack_name'],
-                            'stack_id': stack_info['stack_id'],
-                            'status': stack_info['status'],
-                            'agent_name': stack_agent_name_param,
-                            'managed_by_api': managed_by_api,
-                            'parameters': stack_info.get('parameters', []),
-                            'created_at': stack_info.get('creation_time'),
-                            'updated_at': stack_info.get('last_updated_time')
-                        }
-                    else:
-                        logger.warning(f"Stack {expected_stack_name} found but AgentName parameter mismatch: expected '{agent_name}', got '{stack_agent_name_param}'")
-                        continue
-                        
-                except ValueError:
-                    # Stack not found with this pattern, try next pattern
-                    logger.debug(f"No stack found with pattern: {expected_stack_name}")
-                    continue
-            
-            # If we get here, no stack was found with any pattern
-            logger.info(f"No stack found for agent '{agent_name}' using any naming pattern")
-            return None
+                    return {
+                        'stack_name': stack_info['stack_name'],
+                        'stack_id': stack_info['stack_id'],
+                        'status': stack_info['status'],
+                        'agent_name': stack_agent_name_param,
+                        'managed_by_api': managed_by_api,
+                        'parameters': stack_info.get('parameters', []),
+                        'created_at': stack_info.get('creation_time'),
+                        'updated_at': stack_info.get('last_updated_time')
+                    }
+                else:
+                    logger.warning(f"Stack {expected_stack_name} found but AgentName parameter mismatch: expected '{agent_name}', got '{stack_agent_name_param}'")
+                    return None
+                    
+            except ValueError:
+                # Stack not found with standard pattern
+                logger.info(f"No stack found for agent '{agent_name}' using pattern: {expected_stack_name}")
+                return None
                 
         except Exception as e:
             log_exception_safely(logger, e, f"Error finding agent stack for '{agent_name}'")
             return None
     
+    async def update_agent_stack(
+        self,
+        agent_name: str,
+        parameters: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
+        """
+        Update an existing agent stack with new parameters or template changes.
+        
+        This method:
+        1. Finds the CloudFormation stack for the agent
+        2. Fetches the latest template from S3 (not the stack's current template)
+        3. Updates the stack, merging new parameters (if provided) over existing ones
+        
+        Args:
+            agent_name: Name of the agent whose stack to update
+            parameters: Optional dictionary of CloudFormation parameters to update
+            
+        Returns:
+            Dictionary containing update information
+            
+        Raises:
+            ValueError: If agent stack not found
+            Exception: If stack update fails
+        """
+        try:
+            logger.info(f"Updating agent stack for: {agent_name}")
+            
+            # Find the stack for this agent
+            stack_info = await self.find_agent_stack_by_name(agent_name)
+            
+            if not stack_info:
+                raise ValueError(f"No CloudFormation stack found for agent '{agent_name}'")
+            
+            stack_name = stack_info['stack_name']
+            logger.info(f"Found stack '{stack_name}' for agent '{agent_name}'")
+            
+            # Fetch the latest template from S3 to pick up any CDK template changes
+            # This ensures updates include latest infrastructure improvements
+            logger.info("Fetching latest template from S3 for stack update")
+            template_from_s3 = await self._get_template_from_s3("GenericAgentTemplate.json")
+            
+            # No template modification - parameters must be explicitly provided
+            # AgentName and ImageTag are required parameters with no defaults
+            logger.info("Using template as-is - no defaults to modify")
+            
+            # Prepare update parameters with unmodified template
+            update_params = {
+                'StackName': stack_name,
+                'TemplateBody': json.dumps(template_from_s3),
+                'Capabilities': ['CAPABILITY_IAM', 'CAPABILITY_NAMED_IAM']
+            }
+            
+            # Build parameters list, preserving existing parameters and merging with new ones
+            # CRITICAL: AgentName must ALWAYS be explicitly set - it is a required parameter
+            
+            # Get existing parameters from stack
+            existing_params = {
+                param['ParameterKey']: param['ParameterValue']
+                for param in stack_info.get('parameters', [])
+            }
+            
+            logger.info(f"Existing parameters from stack: {existing_params}")
+            
+            # If new parameters provided, merge them with existing ones
+            if parameters:
+                existing_params.update(parameters)
+                logger.info(f"Merged {len(parameters)} new parameters with existing parameters")
+            
+            # CRITICAL: Always ensure AgentName is set to the correct agent name
+            # This overrides any AgentName supplied by the caller in `parameters`
+            existing_params['AgentName'] = agent_name
+            logger.info(f"Explicitly set AgentName parameter to: {agent_name}")
+            
+            # CRITICAL: Retrieve and set ImageTag from SSM to ensure ECS updates
+            # The existing parameter may contain a CDK token like ${Token[TOKEN.262]}
+            # which must be replaced with the actual full image URI with SHA256 tag from SSM
+            logger.debug(f"Current ImageTag parameter value before SSM retrieval: {existing_params.get('ImageTag', 'NOT SET')}")
+            try:
+                logger.info("Retrieving image URI from SSM for ImageTag parameter...")
+                image_uri = self._get_image_uri_from_ssm()
+                logger.debug(f"Full image URI from SSM: {image_uri}")
+                
+                # IMPORTANT: Pass the FULL image URI, not just the tag
+                # The CloudFormation template expects the complete URI with repository and SHA256 tag
+                logger.debug(f"Replacing ImageTag parameter: '{existing_params.get('ImageTag')}' -> '{image_uri}'")
+                existing_params['ImageTag'] = image_uri
+                logger.info(f"Successfully updated ImageTag parameter from SSM: {image_uri}")
+            except Exception as e:
+                logger.error(f"Failed to retrieve ImageTag from SSM: {e}")
+                logger.warning(f"Proceeding with existing ImageTag value: {existing_params.get('ImageTag', 'NOT SET')}")
+                logger.warning("ECS may not update if ImageTag hasn't changed")
+            
+            # Build CloudFormation parameter list from merged parameters
+            # Put AgentName FIRST to emphasize its importance
+            cfn_parameters = [
+                {
+                    'ParameterKey': 'AgentName',
+                    'ParameterValue': agent_name
+                }
+            ]
+            
+            # Add all other parameters
+            for key, value in existing_params.items():
+                if key != 'AgentName':  # Skip AgentName since we already added it first
+                    cfn_parameters.append({
+                        'ParameterKey': key,
+                        'ParameterValue': str(value)
+                    })
+            
+            update_params['Parameters'] = cfn_parameters
+            logger.info(f"Final parameters for CloudFormation update (AgentName={agent_name} is first): {[p['ParameterKey'] for p in cfn_parameters]}")
+            
+            # Carry over user tags; drop aws:* and stale LastUpdated* tags so tag keys stay unique across repeated updates
+            existing_tags = stack_info.get('tags', [])
+            update_tags = [tag for tag in existing_tags if not tag['Key'].startswith('aws:') and tag['Key'] not in ('LastUpdatedBy', 'LastUpdatedAt')]
+            update_tags.append({
+                'Key': 'LastUpdatedBy',
+                'Value': 'ConfigurationAPI'
+            })
+            update_tags.append({
+                'Key': 'LastUpdatedAt',
+                'Value': datetime.utcnow().isoformat()
+            })
+            update_params['Tags'] = update_tags
+            
+            # Execute stack update
+            # Note: EnableTerminationProtection is only valid for create_stack, not update_stack
+            try:
+                response = self.cloudformation.update_stack(**update_params)
+                stack_id = response['StackId']
+                
+                logger.info(f"Successfully initiated stack update for '{stack_name}' (with ForceNewDeployment)")
+                
+                return {
+                    'stack_name': stack_name,
+                    'stack_id': stack_id,
+                    'status': 'UPDATE_IN_PROGRESS',
+                    'agent_name': agent_name,
+                    'message': f'Stack update initiated for agent {agent_name} - ECS service will force new deployment'
+                }
+                
+            except ClientError as e:
+                error_code = e.response['Error']['Code']
+                error_message = e.response['Error']['Message']
+                
+                # Handle "No updates are to be performed" case gracefully
+                if error_code == 'ValidationError' and 'No updates are to be performed' in error_message:
+                    logger.info(f"No updates needed for stack '{stack_name}'")
+                    return {
+                        'stack_name': stack_name,
+                        'stack_id': stack_info['stack_id'],
+                        'status': stack_info['status'],
+                        'agent_name': agent_name,
+                        'message': 'Stack is already up to date - no changes needed'
+                    }
+                else:
+                    logger.error(f"CloudFormation error: {error_code} - {error_message}")
+                    raise Exception(f"Failed to update stack: {error_message}") from e
+            
+        except ValueError:
+            # Re-raise ValueError for agent not found
+            raise
+        except Exception as e:
+            log_exception_safely(logger, e, f"Error updating agent stack for '{agent_name}'")
+            raise
+    
+    def _get_image_uri_from_ssm(self) -> str:
+        """
+        Retrieve the agent image URI from SSM Parameter Store.
+        
+        This retrieves the SHA256-tagged image URI that was stored during
+        the CDK deployment, ensuring ECS tasks always pull the correct image version.
+        
+        Returns:
+            Image URI with SHA256 tag (e.g., "123456789.dkr.ecr.us-east-1.amazonaws.com/repo:sha256tag")
+            
+        Raises:
+            Exception: If parameter not found or retrieval fails (the ClientError is chained as the cause)
+        """
+        try:
+            # Get SSM client
+            ssm = boto3.client('ssm', region_name=self.region)
+            
+            # Parameter name where CDK stores the image URI
+            # This matches the actual parameter created by template_storage stack:
+            # ssm.StringParameter(parameter_name=f"/{project_name}/agent/image-uri", ...)
+            parameter_name = f"/{self.project_name}/agent/image-uri"
+            
+            logger.debug(f"Fetching SSM parameter: {parameter_name}")
+            response = ssm.get_parameter(Name=parameter_name)
+            image_uri = response['Parameter']['Value']
+            
+            logger.info(f"Successfully retrieved image URI from SSM: {image_uri}")
+            
+            # Log the expected tag extraction for debugging
+            if ':' in image_uri:
+                expected_tag = image_uri.split(':')[-1]
+                logger.debug(f"Expected tag after extraction: {expected_tag}")
+            else:
+                logger.warning("Image URI doesn't contain ':' separator - will default to 'latest'")
+            
+            return image_uri
+            
+        except ClientError as e:
+            error_code = e.response['Error']['Code']
+            if error_code == 'ParameterNotFound':
+                logger.error(f"SSM parameter not found: {parameter_name}")
+                logger.error("Ensure CDK deployment completed successfully and created this parameter")
+                logger.error(f"Check if parameter exists with: aws ssm get-parameter --name {parameter_name}")
+            else:
+                logger.error(f"AWS error retrieving SSM parameter: {error_code}")
+            raise Exception(f"Failed to retrieve image URI from SSM: {e.response['Error']['Message']}") from e
+        except Exception as e:
+            logger.error("Unexpected error retrieving image URI from SSM")
+            log_exception_safely(logger, e, "Error retrieving image URI from SSM")
+            raise
+    
     async def delete_stack(self, stack_name: str) -> Dict[str, Any]:
         """
         Delete a CloudFormation stack.
diff --git a/application_src/configuration-api/app/services/parameter_initialization.py b/application_src/configuration-api/app/services/parameter_initialization.py
index 754c6e5..a88bfb6 100644
--- a/application_src/configuration-api/app/services/parameter_initialization.py
+++ b/application_src/configuration-api/app/services/parameter_initialization.py
@@ -259,13 +259,13 @@ def _get_default_agent_config(self, agent_name: str) -> Dict[str, Any]:
         )
         
         # Validate the configuration before returning
-        validation_result = SSMDataValidator.validate_agent_configuration(config_model.dict())
+        validation_result = SSMDataValidator.validate_agent_configuration(config_model.model_dump(mode='json'))
         if not validation_result["valid"]:
             logger.error(f"Configuration from development.yaml is invalid: {validation_result['errors']}")
             raise ValueError(f"Invalid configuration from development.yaml: {validation_result['errors']}")
         
         logger.info(f"✅ Generated valid configuration for {agent_name} using ONLY development.yaml values")
-        return config_model.dict()
+        return config_model.model_dump(mode='json')
     
     def _get_default_supervisor_config(self, supervisor_name: str) -> Dict[str, Any]:
         """Get default supervisor agent configuration using ONLY configuration values from development.yaml."""
diff --git a/application_src/configuration-api/app/utils/dependencies.py b/application_src/configuration-api/app/utils/dependencies.py
index fdb9f74..421ed4e 100644
--- a/application_src/configuration-api/app/utils/dependencies.py
+++ b/application_src/configuration-api/app/utils/dependencies.py
@@ -10,7 +10,6 @@
 
 from ..services import SSMService, DiscoveryService, AgentConfigService
 from ..services.deployment_service import DeploymentService
-from ..services.cloudformation_deployment_service import CloudFormationDeploymentService
 
 
 @lru_cache()
@@ -67,26 +66,3 @@ def get_deployment_service() -> DeploymentService:
         Configured Deployment service instance
     """
     return DeploymentService()
-
-
-@lru_cache()
-def get_cloudformation_deployment_service() -> CloudFormationDeploymentService:
-    """
-    Get CloudFormation Deployment service instance.
-    
-    Returns:
-        Configured CloudFormation Deployment service instance
-    """
-    region = os.environ.get('AWS_REGION', 'us-east-1')
-    # Use ai-platform as project name to match CDK stack naming convention
-    project_name = os.environ.get('PROJECT_NAME', 'ai-platform')
-    template_bucket = os.environ.get('S3_TEMPLATE_BUCKET')
-
-    if not template_bucket:
-        raise ValueError("S3_TEMPLATE_BUCKET environment variable is required")
-    
-    return CloudFormationDeploymentService(
-        region=region,
-        project_name=project_name,
-        template_bucket=template_bucket
-    )
diff --git a/application_src/ui-react-cloudscape/package-lock.json b/application_src/ui-react-cloudscape/package-lock.json
index 940058b..6e83ffd 100644
--- a/application_src/ui-react-cloudscape/package-lock.json
+++ b/application_src/ui-react-cloudscape/package-lock.json
@@ -17,10 +17,12 @@
         "axios": "^1.12.2",
         "concurrently": "7.6.0",
         "cors": "^2.8.5",
+        "dompurify": "^3.3.0",
         "express": "^4.21.2",
         "express-session": "^1.18.2",
         "helmet": "^8.1.0",
         "i18next": "23.7.16",
+        "marked": "^16.4.1",
         "react": "18.2.0",
         "react-dom": "18.2.0",
         "react-i18next": "14.0.0",
@@ -11697,6 +11699,14 @@
         "url": "https://github.com/fb55/domhandler?sponsor=1"
       }
     },
+    "node_modules/dompurify": {
+      "version": "3.3.0",
+      "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.3.0.tgz",
+      "integrity": "sha512-r+f6MYR1gGN1eJv0TVQbhA7if/U7P87cdPl3HN5rikqaBSBxLiCb/b9O+2eG0cxz0ghyU+mU1QkbsOwERMYlWQ==",
+      "optionalDependencies": {
+        "@types/trusted-types": "^2.0.7"
+      }
+    },
     "node_modules/domutils": {
       "version": "2.8.0",
       "resolved": "https://registry.npmjs.org/domutils/-/domutils-2.8.0.tgz",
@@ -16170,6 +16180,17 @@
         "tmpl": "1.0.5"
       }
     },
+    "node_modules/marked": {
+      "version": "16.4.1",
+      "resolved": "https://registry.npmjs.org/marked/-/marked-16.4.1.tgz",
+      "integrity": "sha512-ntROs7RaN3EvWfy3EZi14H4YxmT6A5YvywfhO+0pm+cH/dnSQRmdAmoFIc3B9aiwTehyk7pESH4ofyBY+V5hZg==",
+      "bin": {
+        "marked": "bin/marked.js"
+      },
+      "engines": {
+        "node": ">= 20"
+      }
+    },
     "node_modules/math-intrinsics": {
       "version": "1.1.0",
       "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz",
diff --git a/application_src/ui-react-cloudscape/package.json b/application_src/ui-react-cloudscape/package.json
index e7329e2..0dc1b5f 100644
--- a/application_src/ui-react-cloudscape/package.json
+++ b/application_src/ui-react-cloudscape/package.json
@@ -12,10 +12,12 @@
     "axios": "^1.12.2",
     "concurrently": "7.6.0",
     "cors": "^2.8.5",
+    "dompurify": "^3.3.0",
     "express": "^4.21.2",
     "express-session": "^1.18.2",
     "helmet": "^8.1.0",
     "i18next": "23.7.16",
+    "marked": "^16.4.1",
     "react": "18.2.0",
     "react-dom": "18.2.0",
     "react-i18next": "14.0.0",
diff --git a/application_src/ui-react-cloudscape/server.js b/application_src/ui-react-cloudscape/server.js
index 36faccb..2bab564 100644
--- a/application_src/ui-react-cloudscape/server.js
+++ b/application_src/ui-react-cloudscape/server.js
@@ -113,6 +113,32 @@ const safeUrlEncode = (str) => {
     .substring(0, 500); // Limit length to prevent log injection
 };
 
+// Serialize a fixed subset of an error (message, name, response status/data, request
+// method/url/timeout) to a JSON string; falls back to a one-line summary if that fails.
+const safeErrorSerialize = (error) => {
+  if (!error) return 'Unknown error';
+  const { message, name, response, config } = error;
+  try {
+    // Request settings worth logging; stays undefined when the error carries no config
+    let requestInfo;
+    if (config) {
+      requestInfo = { method: config.method, url: config.url, timeout: config.timeout };
+    }
+    const picked = {
+      message,
+      name,
+      status: response?.status,
+      statusText: response?.statusText,
+      data: response?.data,
+      config: requestInfo
+    };
+    return JSON.stringify(picked, null, 2);
+  } catch (serializeError) {
+    // The picked fields themselves may not be serializable (e.g. circular response data)
+    return `Error: ${message || 'Unknown'}, Status: ${response?.status || 'Unknown'}`;
+  }
+};
+
 // Enhanced security: Validate agent name parameter
 const validateAgentName = (agentName) => {
   if (!agentName || typeof agentName !== 'string') {
@@ -736,7 +762,7 @@ app.post('/api/config/system-prompts/create/:agentName', async (req, res) => {
     }
     
     console.log(`[PROXY] POST ${CONFIGURATION_API_ENDPOINT}/config/system-prompts/create/${safeUrlEncode(agentName)}`);
-    console.log('[PROXY] 🔐 Forwarding Authorization header for system prompt creation');
+    console.log('[PROXY] Forwarding Authorization header for system prompt creation');
     const response = await axios.post(`${CONFIGURATION_API_ENDPOINT}/config/system-prompts/create/${safeUrlEncode(agentName)}`, req.body, {
       headers: getAuthHeaders(req) // ← FIX: Forward auth headers for system prompt creation
     });
@@ -757,7 +783,7 @@ app.get('/api/config/discover', async (req, res) => {
     res.json(response.data);
   } catch (error) {
     console.error('[PROXY ERROR] Failed to discover services:', safeUrlEncode(error.message));
-    console.error('[PROXY ERROR] Error details:', safeUrlEncode(JSON.stringify(error.response?.data) || error.message || 'Unknown error'));
+    console.error('[PROXY ERROR] Error details:', safeUrlEncode(safeErrorSerialize(error)));
     handleAuthError(error, res, 'discover services');
   }
 });
@@ -814,7 +840,7 @@ app.post('/api/config/refresh-agent/:agentName', async (req, res) => {
     }
     
     console.log('[PROXY] POST /api/config/refresh-agent/', agentName, '->', `${CONFIGURATION_API_ENDPOINT}/config/refresh-agent/${agentName}`);
-    console.log('[PROXY] 🔐 Forwarding Authorization header for agent refresh');
+    console.log('[PROXY] Forwarding Authorization header for agent refresh');
     
     const response = await axios.post(`${CONFIGURATION_API_ENDPOINT}/config/refresh-agent/${safeUrlEncode(agentName)}`, {}, {
       headers: getAuthHeaders(req), // ← FIX: Forward auth headers for agent refresh
@@ -846,7 +872,7 @@ app.post('/api/config/agent/:agentName/reload', async (req, res) => {
     }
     
     console.log(`[PROXY] POST /api/config/agent/${agentName}/reload`);
-    console.log('[PROXY] 🔐 Forwarding Authorization header for agent reload');
+    console.log('[PROXY] Forwarding Authorization header for agent reload');
     
     // Step 1: Get agent mapping to find agent endpoint (with auth)
     const mappingResponse = await axios.get(`${CONFIGURATION_API_ENDPOINT}/agent-mapping`, {
@@ -945,7 +971,7 @@ app.post('/api/config/agent/:agentName/reload', async (req, res) => {
 app.post('/api/deployment/create-agent', async (req, res) => {
   try {
     console.log('[PROXY] POST /api/deployment/create-agent');
-    console.log('[PROXY] 🔐 Forwarding Authorization header for deployment API');
+    console.log('[PROXY] Forwarding Authorization header for deployment API');
     console.log('[PROXY] Request body:', req.body);
     
     // Forward the request directly to the Config API's create-agent endpoint
@@ -973,7 +999,7 @@ app.get('/api/deployment/stack-status/:agentName', async (req, res) => {
     }
     
     console.log(`[PROXY] GET /api/deployment/stack-status/${agentName}`);
-    console.log('[PROXY] 🔐 Forwarding Authorization header for stack status check');
+    console.log('[PROXY] Forwarding Authorization header for stack status check');
     
     const response = await axios.get(`${CONFIGURATION_API_ENDPOINT}/api/deployment/stack-status/${safeUrlEncode(agentName)}`, {
       headers: getAuthHeaders(req),
@@ -991,7 +1017,7 @@ app.get('/api/deployment/stack-status/:agentName', async (req, res) => {
 app.post('/api/deployment/refresh-agent-urls', async (req, res) => {
   try {
     console.log(`[PROXY] POST ${SUPERVISOR_AGENT_ENDPOINT}/refresh-agent-urls`);
-    console.log('[PROXY] 🔐 Forwarding Authorization header to Supervisor Agent for refresh');
+    console.log('[PROXY] Forwarding Authorization header to Supervisor Agent for refresh');
     
     const response = await axios.post(`${SUPERVISOR_AGENT_ENDPOINT}/refresh-agent-urls`, {}, {
       headers: getAuthHeaders(req),  // ← FIX: Forward auth headers for refresh
@@ -1007,11 +1033,38 @@ app.post('/api/deployment/refresh-agent-urls', async (req, res) => {
   }
 });
 
+// Stack Update proxy route: forwards the PUT body and auth headers to the Configuration API stack endpoint
+app.put('/api/deployment/stack/:agentName', async (req, res) => {
+  const { agentName } = req.params;
+  
+  try {
+    // Security validation: reject bad names before they are logged or placed in the upstream URL
+    if (!validateAgentName(agentName)) {
+      return res.status(400).json({ error: 'Invalid agent name format' });
+    }
+    
+    console.log(`[PROXY] PUT /api/deployment/stack/${agentName}`);
+    console.log('[PROXY] Forwarding Authorization header for stack update');
+    const response = await axios.put(`${CONFIGURATION_API_ENDPOINT}/api/deployment/stack/${safeUrlEncode(agentName)}`, req.body, {
+      headers: getAuthHeaders(req),
+      timeout: 600000 // 10 minutes timeout for update operations
+    });
+    
+    console.log(`[PROXY] Stack update successful for ${safeUrlEncode(agentName)}`);
+    res.json(response.data);
+  } catch (error) {
+    console.error('[PROXY ERROR] Failed to update stack:', safeUrlEncode(agentName), safeUrlEncode(error.message || 'Unknown error'));
+    // Shared serializer, as in the discover route; the old `JSON.stringify(data || {}) || ...` fallback could never run
+    console.error('[PROXY ERROR] Error details:', safeUrlEncode(safeErrorSerialize(error)));
+    handleAuthError(error, res, 'update agent stack');
+  }
+});
+
 // STREAMING ONLY Proxy routes for Supervisor Agent (UX Optimized) with OAuth forwarding
 app.post('/api/agent/chat', async (req, res) => {
   try {
-    console.log('[PROXY] 🌊 STREAMING-ONLY: POST /api/agent/chat -> Supervisor Agent Streaming');
-    console.log('[PROXY] 🔐 Forwarding Authorization header to Supervisor Agent');
+    console.log('[PROXY] STREAMING-ONLY: POST /api/agent/chat -> Supervisor Agent Streaming');
+    console.log('[PROXY] Forwarding Authorization header to Supervisor Agent');
     
     const response = await axios.post(`${SUPERVISOR_AGENT_ENDPOINT}/agent-streaming`, req.body, {
       headers: getAuthHeaders(req),  // ← FIX: Forward auth headers
@@ -1034,9 +1087,9 @@ app.post('/api/agent/chat', async (req, res) => {
 
 // DEPRECATED: Redirect sync calls to streaming for consistency with OAuth forwarding
 app.post('/api/agent/chat-sync', async (req, res) => {
-  console.log('[PROXY] 🚨 DEPRECATED: chat-sync called - redirecting to streaming for optimal UX');
-  console.log('[PROXY] 💡 STREAMING ENFORCED: All UI communication uses streaming');
-  console.log('[PROXY] 🔐 Forwarding Authorization header to Supervisor Agent');
+  console.log('[PROXY] DEPRECATED: chat-sync called - redirecting to streaming for optimal UX');
+  console.log('[PROXY] STREAMING ENFORCED: All UI communication uses streaming');
+  console.log('[PROXY] Forwarding Authorization header to Supervisor Agent');
   
   try {
     // Redirect to streaming endpoint with proper auth headers
@@ -1054,7 +1107,7 @@ app.post('/api/agent/chat-sync', async (req, res) => {
       });
       
       response.data.on('end', () => {
-        console.log('[PROXY] 💡 Streaming->Sync conversion complete');
+        console.log('[PROXY] Streaming->Sync conversion complete');
         res.json({ response: completeResponse });
         resolve();
       });
@@ -1083,7 +1136,7 @@ app.delete('/api/config/delete/:agentName', async (req, res) => {
     }
     
     console.log(`[PROXY] DELETE /api/config/delete/${agentName} -> ${CONFIGURATION_API_ENDPOINT}/config/delete/${agentName}`);
-    console.log('[PROXY] 🔐 Forwarding Authorization header for agent deletion');
+    console.log('[PROXY] Forwarding Authorization header for agent deletion');
     
     const response = await axios.delete(`${CONFIGURATION_API_ENDPOINT}/config/delete/${safeUrlEncode(agentName)}`, {
       headers: getAuthHeaders(req) // ← FIX: Forward auth headers for deletion
@@ -1111,7 +1164,7 @@ app.delete('/api/config/delete-complete/:agentName', async (req, res) => {
     
     console.log(`[PROXY] DELETE /api/config/delete-complete/${agentName} -> ${CONFIGURATION_API_ENDPOINT}/config/delete-complete/${agentName}`);
     console.log(`[PROXY] Include infrastructure: ${includeInfrastructure}`);
-    console.log('[PROXY] 🔐 Forwarding Authorization header for complete agent deletion');
+    console.log('[PROXY] Forwarding Authorization header for complete agent deletion');
     
     const response = await axios.delete(`${CONFIGURATION_API_ENDPOINT}/config/delete-complete/${safeUrlEncode(agentName)}`, {
       params: {
@@ -1274,18 +1327,18 @@ app.get('*', (req, res) => {
 const HOST = '0.0.0.0';
 
 app.listen(PORT, HOST, () => {
-  console.log(`🚀 React UI Backend Server running on ${HOST}:${PORT}`);
-  console.log(`📡 Configuration API: ${CONFIGURATION_API_ENDPOINT}`);
-  console.log(`🤖 Supervisor Agent: ${SUPERVISOR_AGENT_ENDPOINT}`);
-  console.log(`🔧 NODE_ENV: ${process.env.NODE_ENV || 'undefined'}`);
+  console.log(`React UI Backend Server running on ${HOST}:${PORT}`);
+  console.log(`Configuration API: ${CONFIGURATION_API_ENDPOINT}`);
+  console.log(`Supervisor Agent: ${SUPERVISOR_AGENT_ENDPOINT}`);
+  console.log(`NODE_ENV: ${process.env.NODE_ENV || 'undefined'}`);
   console.log(` Current directory: ${__dirname}`);
   
   const fs = require('fs');
   try {
     const buildExists = fs.existsSync(path.join(__dirname, 'build'));
-    console.log(`📦 Build directory exists: ${buildExists}`);
+    console.log(`Build directory exists: ${buildExists}`);
   } catch (e) {
-    console.log(`📦 Error checking build directory: ${e.message}`);
+    console.log(`Error checking build directory: ${e.message}`);
   }
 });
 
diff --git a/application_src/ui-react-cloudscape/src/components/AgentMapping.js b/application_src/ui-react-cloudscape/src/components/AgentMapping.js
index d916440..403e6f5 100644
--- a/application_src/ui-react-cloudscape/src/components/AgentMapping.js
+++ b/application_src/ui-react-cloudscape/src/components/AgentMapping.js
@@ -46,6 +46,10 @@ const AgentMapping = ({
   const [showDeleteConfirmation, setShowDeleteConfirmation] = useState(false);
   const [selectedSkillDetails, setSelectedSkillDetails] = useState(null);
   const [showSkillDetails, setShowSkillDetails] = useState(false);
+  
+  // Update agent state
+  const [updatingAgent, setUpdatingAgent] = useState(null);
+  const [updateSuccess, setUpdateSuccess] = useState(null);
 
   useEffect(() => {
     if (isOpen) {
@@ -444,6 +448,43 @@ const AgentMapping = ({
     setSelectedSkillDetails(null);
   };
 
+  // Trigger a CloudFormation stack update for one agent and report the outcome via setError / setUpdateSuccess.
+  // The supervisor agent is rejected up front; a "No updates needed" failure is reported as success.
+  const handleUpdateAgent = async (agentName) => {
+    if (agentName === 'supervisor-agent') {
+      setError('Cannot update supervisor agent stack - it is managed by the main CDK deployment');
+      return;
+    }
+    
+    setUpdatingAgent(agentName);
+    setError(null);
+    setUpdateSuccess(null);
+    
+    try {
+      const configService = await import('../services/configuration');
+      // Trigger CloudFormation stack update (the response payload is not used here)
+      await configService.default.updateAgentStack(agentName);
+      setUpdateSuccess(`Successfully updated stack for agent: ${agentName}`);
+      
+      // Refresh agent mapping after successful update
+      setTimeout(async () => {
+        await loadAgentMappings();
+        setUpdateSuccess(null);
+      }, 3000);
+    } catch (error) {
+      // A rejection is not guaranteed to be an Error; normalize before string matching
+      const message = typeof error?.message === 'string' ? error.message : String(error);
+      if (message.includes('No updates needed')) {
+        setUpdateSuccess(`Agent ${agentName} is already up to date`);
+        setTimeout(() => setUpdateSuccess(null), 3000);
+      } else {
+        setError(`Failed to update agent ${agentName}: ${message}`);
+      }
+    } finally {
+      setUpdatingAgent(null);
+    }
+  };
+
   return (
     <Modal
       visible={isOpen}
@@ -511,6 +552,18 @@ const AgentMapping = ({
           </Alert>
         )}
 
+        {updateSuccess && (
+          <Alert 
+            type="success" 
+            header="Update Successful"
+            dismissible 
+            onDismiss={() => setUpdateSuccess(null)}
+            statusIconAriaLabel="Success"
+          >
+            {updateSuccess}
+          </Alert>
+        )}
+
         {/* AWS Foundation Visual Context - Network Overview */}
         {agentMappingData?.summary && (
           <Container 
@@ -1073,7 +1126,7 @@ const AgentMapping = ({
                       content: item => (
                         /* AWS Foundation Visual Context - Agent Actions */
                         <SpaceBetween size="s">
-                          <SpaceBetween direction="horizontal" size="s">
+                          <SpaceBetween direction="horizontal" size="s" wrap={true}>
                             <CanUpdateAgents>
                               <Button
                                 onClick={() => onOpenAgentWizard(item.name)}
@@ -1086,6 +1139,27 @@ const AgentMapping = ({
                               </Button>
                             </CanUpdateAgents>
                             
+                            <CanUpdateAgents>
+                              {item.name !== 'supervisor-agent' && (
+                                <SpaceBetween direction="vertical" size="xxs">
+                                  <Button
+                                    onClick={() => handleUpdateAgent(item.name)}
+                                    iconName="upload"
+                                    variant="normal"
+                                    size="small"
+                                    loading={updatingAgent === item.name}
+                                    disabled={updatingAgent !== null}
+                                    ariaLabel={`Update CloudFormation stack for ${item.name}`}
+                                  >
+                                    {updatingAgent === item.name ? 'Updating...' : 'Update Stack'}
+                                  </Button>
+                                  <Box variant="small" color="text-body-secondary" textAlign="center">
+                                    Sync infrastructure with latest config
+                                  </Box>
+                                </SpaceBetween>
+                              )}
+                            </CanUpdateAgents>
+                            
                             <CanDeleteAgents>
                               {item.canDelete !== false && (
                                 <Button
@@ -1107,7 +1181,7 @@ const AgentMapping = ({
                             <SpaceBetween direction="horizontal" size="xs" alignItems="center">
                               <Icon name="status-warning" />
                               <Box variant="small" color="text-status-warning">
-                                System agent - cannot be deleted
+                                System agent - managed by main CDK deployment
                               </Box>
                             </SpaceBetween>
                           )}
diff --git a/application_src/ui-react-cloudscape/src/components/AgentWizard.js b/application_src/ui-react-cloudscape/src/components/AgentWizard.js
index af1dda0..f02bbdc 100644
--- a/application_src/ui-react-cloudscape/src/components/AgentWizard.js
+++ b/application_src/ui-react-cloudscape/src/components/AgentWizard.js
@@ -588,41 +588,103 @@ const AgentWizard = ({
         region_name: config.region_name || 'us-east-1'
       };
       
-      // Map model configuration
-      if (config.model_id !== undefined) {
-        transformedData['models_bedrock_model_id'] = config.model_id;
-      }
-      if (config.embedding_model_id !== undefined) {
-        transformedData['models_bedrock_embedding_model_id'] = config.embedding_model_id;
-      }
-      if (config.temperature !== undefined) {
-        transformedData['models_bedrock_temperature'] = config.temperature;
-      }
-      if (config.top_p !== undefined) {
-        transformedData['models_bedrock_top_p'] = config.top_p;
-      }
-      
-      // Handle model_ids array for multi-select dropdown - Issue #2
-      if (config.model_ids && Array.isArray(config.model_ids)) {
-        transformedData['models_bedrock_model_ids'] = config.model_ids;
-      }
-      
-      transformedData['models_enabled'] = true;
-      transformedData['models_provider'] = 'bedrock';
-      
-      // Handle other components
-      ['tools', 'knowledge_base', 'memory', 'observability', 'guardrail'].forEach(componentType => {
+      // Handle ALL components with consistent provider_details pattern
+      ['models', 'tools', 'knowledge_base', 'memory', 'observability', 'guardrail'].forEach(componentType => {
         const enabledValue = config[componentType];
-        
+
+        // Set enabled status
         if (componentType === 'tools' && Array.isArray(enabledValue)) {
           transformedData[`${componentType}_enabled`] = enabledValue.length > 0;
+        } else if (componentType === 'models') {
+          transformedData['models_enabled'] = true;
         } else {
-          transformedData[`${componentType}_enabled`] = 
+          transformedData[`${componentType}_enabled`] =
             enabledValue === 'True' || enabledValue === true || enabledValue === 'enabled';
         }
-        
+
+        // Set provider name
         if (config[`${componentType}_provider`]) {
           transformedData[`${componentType}_provider`] = config[`${componentType}_provider`];
+        } else if (componentType === 'models') {
+          transformedData['models_provider'] = 'bedrock';
+        }
+
+        // Special handling for tools - uses direct 'tools' array instead of 'tools_provider_details'
+        if (componentType === 'tools' && Array.isArray(config.tools) && config.tools.length > 0) {
+          // Infer provider from tool type - default to 'builtin' for standard tools
+          const inferredProvider = 'builtin';
+          transformedData[`${componentType}_provider`] = inferredProvider;
+          
+          config.tools.forEach(toolConfig => {
+            const toolName = toolConfig.name;
+            const toolConfigValues = toolConfig.config || {};
+            
+            // Map each tool config value to form field
+            Object.entries(toolConfigValues).forEach(([configKey, configValue]) => {
+              const formFieldKey = `tools_${inferredProvider}_${toolName}_${configKey}`;
+              transformedData[formFieldKey] = configValue;
+            });
+            
+            // Mark the tool as enabled
+            transformedData[`tools_${inferredProvider}_${toolName}_enabled`] = true;
+          });
+        }
+        // Standard provider_details pattern for other components
+        else {
+          const providerDetailsKey = componentType === 'knowledge_base' ? 
+            `${componentType}_details` : 
+            `${componentType}_provider_details`;
+          
+          const providerDetails = config[providerDetailsKey];
+          
+          if (providerDetails && Array.isArray(providerDetails)) {
+            // Standard provider_details format
+            providerDetails.forEach(providerConfig => {
+              const providerName = providerConfig.name;
+              const providerConfigValues = providerConfig.config || {};
+              
+              // Map each config value to form field
+              Object.entries(providerConfigValues).forEach(([configKey, configValue]) => {
+                const formFieldKey = `${componentType}_${providerName}_${configKey}`;
+                transformedData[formFieldKey] = configValue;
+              });
+            });
+          } else {
+            // FALLBACK: Handle flat configuration format for ALL components
+            // This handles cases where SSM stores config without provider_details array structure
+            const providerName = config[`${componentType}_provider`] || 
+                                 (componentType === 'models' ? 'bedrock' : 'default');
+            
+            // Extract all fields that might be stored at the root level for this component
+            // Look for patterns like: model_id, embedding_model_id, api_key, endpoint, etc.
+            Object.entries(config).forEach(([key, value]) => {
+              // Skip the provider_details keys themselves
+              if (key === providerDetailsKey || key === `${componentType}_provider` || key === componentType) {
+                return;
+              }
+              
+              // Check if this key might belong to this component
+              // Common patterns: model_id, judge_model_id, embedding_model_id (for models)
+              //                  api_key, endpoint, url (for observability, memory, etc.)
+              const componentSpecificPatterns = {
+                models: ['model_id', 'judge_model_id', 'embedding_model_id', 'temperature', 'top_p', 'model_ids'],
+                observability: ['api_key', 'endpoint', 'project_id', 'environment', 'service_name', 'url', 'token'],
+                memory: ['endpoint', 'index_name', 'api_key', 'url', 'collection_name'],
+                knowledge_base: ['knowledge_base_id', 'index_name', 'endpoint', 'data_source'],
+                guardrail: ['guardrail_id', 'guardrail_version', 'threshold']
+              };
+              
+              const patterns = componentSpecificPatterns[componentType] || [];
+              const matchesPattern = patterns.some(pattern => 
+                key.toLowerCase().includes(pattern.toLowerCase())
+              );
+              
+              if (matchesPattern && value !== undefined) {
+                const formFieldKey = `${componentType}_${providerName}_${key}`;
+                transformedData[formFieldKey] = value;
+              }
+            });
+          }
         }
       });
       
@@ -1150,7 +1212,8 @@ const AgentWizard = ({
       !['agent', 'models', 'tools'].includes(component.type)
     );
     
-    validComponents.forEach(componentType => {
+    validComponents.forEach(component => {
+      const componentType = component.type;
       const isEnabled = agentData[`${componentType}_enabled`];
       const selectedProvider = agentData[`${componentType}_provider`];
       
@@ -1168,22 +1231,28 @@ const AgentWizard = ({
       }
       
       if (isEnabled && selectedProvider) {
-        // Build provider details array - only include selected provider with values
+        // Build provider details array dynamically - ALWAYS include to match handleCreateAgent
         const providerDetails = [];
         
         // Get all available providers for this component
         componentProviders.forEach(provider => {
           const providerConfig = extractProviderConfig(componentType, provider.name);
           
-          // Only include configuration if this is the selected provider AND has values
-          if (provider.name === selectedProvider) {
-            const hasValues = Object.values(providerConfig).some(value => value && value !== '');
-            if (hasValues) {
-              providerDetails.push({
-                name: provider.name,
-                config: providerConfig
-              });
-            }
+          // CRITICAL FIX: Always include provider config if it's the selected provider OR has values
+          // This matches handleCreateAgent behavior and ensures data persistence
+          const hasValues = Object.values(providerConfig).some(value => value && value !== '');
+          
+          if (provider.name === selectedProvider || hasValues) {
+            providerDetails.push({
+              name: provider.name,
+              config: providerConfig
+            });
+          } else {
+            // Include empty config for non-selected providers to maintain structure
+            providerDetails.push({
+              name: provider.name,
+              config: {}
+            });
           }
         });
         
diff --git a/application_src/ui-react-cloudscape/src/components/ChatInterface.js b/application_src/ui-react-cloudscape/src/components/ChatInterface.js
index 65973a7..85545ad 100644
--- a/application_src/ui-react-cloudscape/src/components/ChatInterface.js
+++ b/application_src/ui-react-cloudscape/src/components/ChatInterface.js
@@ -13,71 +13,52 @@ import Flashbar from '@cloudscape-design/components/flashbar';
 import Badge from '@cloudscape-design/components/badge';
 import Cards from '@cloudscape-design/components/cards';
 import Icon from '@cloudscape-design/components/icon';
+import TextContent from '@cloudscape-design/components/text-content';
+import { marked } from 'marked';
+import DOMPurify from 'dompurify';
 
-// SECURITY: Simple text validation - No HTML filtering needed since using safe React rendering
-const validateContent = (content) => {
+// Configure marked options for better rendering
+marked.setOptions({
+  breaks: true, // Convert line breaks to <br>
+  gfm: true, // GitHub Flavored Markdown
+  headerIds: false, // Disable header IDs for security
+  mangle: false, // Don't escape autolinked email addresses
+});
+
+// SECURITY: Enhanced markdown formatter with proper parsing and sanitization
+const formatMessage = (content) => {
   if (!content || typeof content !== 'string') {
     return '';
   }
-  
-  // Simple length and basic safety checks
-  if (content.length > 50000) {
-    return content.substring(0, 50000) + '... (truncated)';
-  }
-  
-  return content;
-};
 
-// SECURITY: Enhanced HTML escape function to prevent XSS attacks
-const escapeHtml = (unsafe) => {
-  if (!unsafe || typeof unsafe !== 'string') {
-    return unsafe;
+  // Validate content length
+  if (content.length > 50000) {
+    content = content.substring(0, 50000) + '... (truncated)';
   }
-  
-  return unsafe
-    .replace(/&/g, "&amp;")
-    .replace(/</g, "&lt;")
-    .replace(/>/g, "&gt;")
-    .replace(/"/g, "&quot;")
-    .replace(/'/g, "&#039;")
-    .replace(/\//g, "&#x2F;")
-    .replace(/\n/g, "&#10;")
-    .replace(/\r/g, "&#13;")
-    .replace(/\t/g, "&#9;");
-};
 
-// Enhanced markdown formatter for GenAI responses with HTML sanitization
-const formatMessage = (content) => {
-  if (!content || typeof content !== 'string') {
-    return content;
+  try {
+    // Parse markdown to HTML using marked
+    const rawHtml = marked.parse(content);
+    
+    // SECURITY: Sanitize HTML with DOMPurify to prevent XSS
+    const cleanHtml = DOMPurify.sanitize(rawHtml, {
+      ALLOWED_TAGS: [
+        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+        'p', 'br', 'strong', 'em', 'u', 's', 'code', 'pre',
+        'ul', 'ol', 'li', 'blockquote',
+        'table', 'thead', 'tbody', 'tr', 'th', 'td',
+        'a', 'img'
+      ],
+      ALLOWED_ATTR: ['href', 'target', 'rel', 'src', 'alt', 'title'],
+      ALLOW_DATA_ATTR: false
+    });
+    
+    return cleanHtml;
+  } catch (error) {
+    console.error('Error formatting markdown:', error);
+    // Fallback if parsing fails: return the raw content sanitized by DOMPurify (sanitized HTML, not escaped text)
+    return DOMPurify.sanitize(content);
   }
-
-  // SECURITY: Validate and sanitize content first
-  const validatedContent = validateContent(content);
-
-  // First escape all HTML to prevent XSS
-  let safeContent = escapeHtml(validatedContent);
-
-  // Then apply safe markdown formatting using escaped HTML
-  let formatted = safeContent
-    .replace(/```([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
-    .replace(/`([^`]+)`/g, '<code>$1</code>')
-    .replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
-    .replace(/\*(.*?)\*/g, '<em>$1</em>')
-    .replace(/^### (.*$)/gim, '<h4>$1</h4>')
-    .replace(/^## (.*$)/gim, '<h3>$1</h3>')
-    .replace(/^# (.*$)/gim, '<h2>$1</h2>')
-    .replace(/^\d+\.\s+(.*)$/gim, '<li>$1</li>')
-    .replace(/^[-*+]\s+(.*)$/gim, '<li>$1</li>')
-    .replace(/\n\n/g, '<br/><br/>')
-    .replace(/\n/g, '<br/>');
-
-  formatted = formatted
-    .replace(/(<li>.*?<\/li>)(<br\/>)*(?=<li>)/g, '$1')
-    .replace(/(<li>.*?<\/li>(?:<br\/>)*)+/g, '<ul>$&</ul>')
-    .replace(/<ul>(<li>.*?<\/li>(?:<br\/>)*)<\/ul>/g, '<ul>$1</ul>');
-
-  return formatted;
 };
 
 const ChatInterface = ({ 
@@ -409,7 +390,6 @@ const ChatInterface = ({
                   color="text-body-default"
                 >
                   <Box 
-                    variant="code"
                     style={{ 
                       lineHeight: '1.6',
                       wordBreak: 'break-word',
@@ -417,8 +397,16 @@ const ChatInterface = ({
                       whiteSpace: 'pre-wrap'
                     }}
                   >
-                    {/* SECURITY FIX: Use safe text rendering instead of dangerouslySetInnerHTML */}
-                    {item.content}
+                    {/* SECURITY: formatMessage parses markdown with marked, then sanitizes the resulting HTML with DOMPurify */}
+                    {item.isUser ? (
+                      // User messages: render as plain text
+                      <Box variant="code">{item.content}</Box>
+                    ) : (
+                      // AI messages: render formatted markdown with Cloudscape TextContent component
+                      <TextContent>
+                        <div dangerouslySetInnerHTML={{ __html: formatMessage(item.content) }} />
+                      </TextContent>
+                    )}
                   </Box>
                 </Box>
               </Container>
diff --git a/application_src/ui-react-cloudscape/src/services/configuration.js b/application_src/ui-react-cloudscape/src/services/configuration.js
index 32b6536..6724b54 100644
--- a/application_src/ui-react-cloudscape/src/services/configuration.js
+++ b/application_src/ui-react-cloudscape/src/services/configuration.js
@@ -585,6 +585,27 @@ class ConfigurationService {
     return agentName === 'supervisor_agent' || agentName === 'supervisor';
   }
 
+  // CloudFormation Stack Update Method
+  async updateAgentStack(agentName, parameters = {}, timeoutMinutes = 30) {
+    try {
+      const response = await this.api.put(`/api/deployment/stack/${agentName}`, parameters, {
+        params: {
+          timeout_minutes: timeoutMinutes
+        }
+      });
+      return response.data;
+    } catch (error) {
+      // Handle specific error cases
+      if (error.response?.status === 404) {
+        throw new Error(`Stack not found for agent '${agentName}'`);
+      }
+      if (error.response?.data?.detail?.includes('No updates')) {
+        throw new Error('No updates needed - stack is already up to date');
+      }
+      throw new Error(`Failed to update agent stack: ${error.response?.data?.detail || error.message}`);
+    }
+  }
+
 }
 
 export default new ConfigurationService();
diff --git a/stacks/common/base.py b/stacks/common/base.py
index b7dd412..3718099 100644
--- a/stacks/common/base.py
+++ b/stacks/common/base.py
@@ -723,8 +723,15 @@ def _create_ecs_service(self,
                            security_group: ec2.SecurityGroup,
                            desired_count: int,
                            lattice_resources: Dict[str, Any],
-                           port: int) -> ecs.CfnService:
-        """Create ECS service with VPC Lattice configuration and fast rolling deployment behavior."""
+                           port: int,
+                           force_new_deployment: bool = False) -> ecs.CfnService:
+        """Create ECS service with VPC Lattice configuration and fast rolling deployment behavior.
+        
+        Args:
+            force_new_deployment: When True, forces a new deployment even without task definition changes.
+                                 This enables CloudFormation stack updates to trigger redeployment (e.g., for update button).
+                                 Default is False for non-agent services.
+        """
         # Use rolling deployment with 0% minimum for fastest deployments
         # This allows all previous tasks to be killed during deployment for speed
         minimum_healthy_percent = 0  # Allow all tasks to be replaced for fastest deployment
@@ -762,6 +769,7 @@ def _create_ecs_service(self,
             ),
             deployment_configuration=deployment_config,
             health_check_grace_period_seconds=15,  # Reduced for faster deployments
+            force_new_deployment=force_new_deployment,  # Configurable per service type
             vpc_lattice_configurations=[
                 ecs.CfnService.VpcLatticeConfigurationProperty(
                     role_arn=lattice_resources["ecs_service_role"].role_arn,
@@ -793,15 +801,18 @@ def _configure_task_permissions(self,
         if service_type:
             self._apply_permissions_boundary_to_task_role(task_definition.task_role, service_type)
         
+        # EXECUTION ROLE permissions (needed for ECS agent to start containers)
+        self.add_ecr_permissions(task_definition.execution_role)  # ← FIXED: ECR permissions go to execution role
+        self.add_logs_permissions(task_definition.execution_role, log_group_arns)  # ← FIXED: Logging goes to execution role
+        
+        # TASK ROLE permissions (needed by application running inside container)
         self.add_bedrock_permissions(task_definition.task_role)
         # Get KMS key from KMS stack via direct reference (no import/export needed)
         # This is more reliable than import/export and avoids naming mismatches
         kms_key_arn = self._get_kms_key_arn()
         self.add_ssm_permissions(task_definition.task_role, kms_key_arn=kms_key_arn)
-        self.add_logs_permissions(task_definition.task_role, log_group_arns)
         self.add_vpc_lattice_permissions(task_definition.task_role)
         self.add_ecs_task_permissions(task_definition.task_role)
-        self.add_ecr_permissions(task_definition.task_role)
         self.add_ec2_network_permissions(task_definition.task_role)
         # Aurora database integration permissions
         self.add_rds_data_permissions(task_definition.task_role)
@@ -1074,7 +1085,8 @@ def _create_alb_ecs_service(self,
                                task_definition: ecs.FargateTaskDefinition,
                                security_group: ec2.SecurityGroup,
                                desired_count: int,
-                               target_group: elbv2.ApplicationTargetGroup) -> ecs.FargateService:
+                               target_group: elbv2.ApplicationTargetGroup,
+                               force_new_deployment: bool = False) -> ecs.FargateService:
         """Create ECS service with ALB integration."""
         # Circuit breaker is always enabled with rollback for reliability
         minimum_healthy_percent = self.get_optional_config('MinimumHealthyPercent', DEFAULT_MINIMUM_HEALTHY_PERCENT_ROLLING)
@@ -1099,6 +1111,11 @@ def _create_alb_ecs_service(self,
         # Attach the service to the target group
         service.attach_to_application_target_group(target_group)
         
+        # Set force_new_deployment on the underlying CFN resource if enabled
+        if force_new_deployment:
+            cfn_service = service.node.default_child
+            cfn_service.force_new_deployment = True
+        
         return service
     
     def _configure_alb_task_permissions(self,
@@ -1107,12 +1124,16 @@ def _configure_alb_task_permissions(self,
         """Configure IAM permissions for the task (includes VPC Lattice permissions for service discovery)."""
         # Get KMS key ARN using direct reference instead of import/export
         kms_key_arn = self._get_kms_key_arn()
+        
+        # EXECUTION ROLE permissions (needed for ECS agent to start containers)
+        self.add_ecr_permissions(task_definition.execution_role)  # ← FIXED: ECR permissions go to execution role
+        self.add_logs_permissions(task_definition.execution_role, log_group_arns)  # ← FIXED: Logging goes to execution role
+        
+        # TASK ROLE permissions (needed by application running inside container)
         self.add_bedrock_permissions(task_definition.task_role)
         self.add_ssm_permissions(task_definition.task_role, kms_key_arn=kms_key_arn)
-        self.add_logs_permissions(task_definition.task_role, log_group_arns)
         self.add_vpc_lattice_permissions(task_definition.task_role)  # Added for service discovery
         self.add_ecs_task_permissions(task_definition.task_role)
-        self.add_ecr_permissions(task_definition.task_role)
         self.add_ec2_network_permissions(task_definition.task_role)
         # Aurora database integration permissions
         self.add_rds_data_permissions(task_definition.task_role)
@@ -1152,7 +1173,8 @@ def create_alb_vpc_lattice_fargate_service(self,
                                               environment_vars: Optional[Dict[str, str]] = None,
                                               platform: ecr_assets.Platform = ecr_assets.Platform.LINUX_ARM64,
                                               dockerfile_path: Optional[str] = None,
-                                              vpc_lattice_service_name: Optional[str] = None) -> Dict[str, Any]:
+                                              vpc_lattice_service_name: Optional[str] = None,
+                                              force_new_deployment: bool = False) -> Dict[str, Any]:
         """
         Create a Fargate service with ALB and VPC Lattice integration.
         
@@ -1220,7 +1242,7 @@ def create_alb_vpc_lattice_fargate_service(self,
             # Create ECS service that targets the ALB
             ecs_service = self._create_alb_ecs_service(
                 service_name, task_definition, alb_resources["security_group"],
-                desired_count, alb_resources["target_group"]
+                desired_count, alb_resources["target_group"], force_new_deployment
             )
             
             # Configure IAM permissions
diff --git a/stacks/multi_agent/stack.py b/stacks/multi_agent/stack.py
index f8a3184..b3871d3 100644
--- a/stacks/multi_agent/stack.py
+++ b/stacks/multi_agent/stack.py
@@ -5,7 +5,8 @@
     aws_ecs as ecs,
     aws_ec2 as ec2,
     aws_s3 as s3,
-    aws_ecr_assets as ecr_assets
+    aws_ecr_assets as ecr_assets,
+    aws_logs as logs
 )
 
 from helper.config import Config
@@ -123,15 +124,22 @@ def __init__(self,
         # Note: self.region is a read-only property from Stack, don't try to set it
         
         # Create CloudFormation parameter for agent name
-        # Preserve original agent name for SSM parameters and internal references
+        # No default value - must be explicitly provided during stack deployment
         self.agent_name_parameter = cdk.CfnParameter(
             self, "AgentName",
             type="String",
             description="Name of the agent instance (used for SSM configuration and environment variables)",
-            default=agent_name,
             allowed_pattern=r"^[a-z0-9_-]+$",
             constraint_description="Agent name must contain only lowercase letters, numbers, underscores, and hyphens"
         )
+        
+        # Create CloudFormation parameter for the agent Docker image (holds the full image URI, despite the "ImageTag" name)
+        # No default value - must be explicitly provided with proper SHA256 tag
+        self.image_tag_parameter = cdk.CfnParameter(
+            self, "ImageTag",
+            type="String",
+            description="Docker image tag for the agent container - must be full ECR URI with SHA256 hash"
+        )
 
         # Get configuration values
         agent_cpu = self.get_optional_config('AgentCPU', DEFAULT_CPU)
@@ -150,7 +158,7 @@ def __init__(self,
         # This places ALB between VPC Lattice and ECS tasks to solve the 60-second timeout limitation
         # Use generic name for CDK construct IDs to make CloudFormation template reusable
         # CloudFormation parameter is used for actual AWS resource names and environment variables
-        resources = self.create_alb_vpc_lattice_fargate_service(
+        resources = self._create_agent_service_with_versioned_image(
             service_name="agent",  # Use static name for CDK construct IDs
             container_image_path=container_image_path,
             port=container_port,
@@ -161,7 +169,8 @@ def __init__(self,
             environment_vars=environment_vars,
             platform=ecr_assets.Platform.LINUX_ARM64,
             dockerfile_path=dockerfile_path,
-            vpc_lattice_service_name=self.agent_name_parameter.value_as_string  # Pass parameter for VPC Lattice service name
+            vpc_lattice_service_name=self.agent_name_parameter.value_as_string,  # Pass parameter for VPC Lattice service name
+            force_new_deployment=True  # Enable update button for CloudFormation stack updates
         )
         
         # Add Bedrock AgentCore Memory permissions to agent task definition
@@ -373,3 +382,134 @@ def _apply_agent_service_permissions_boundary(self, task_definition: ecs.Fargate
             import logging
             logger = logging.getLogger(__name__)
             logger.warning(f"Failed to apply agent service permissions boundary: {str(e)}")
+    
+    def _create_agent_service_with_versioned_image(self, **kwargs) -> Dict:
+        """
+        Create agent service using a versioned ECR image reference instead of building from assets.
+        This enables proper version control and update triggering when the image tag changes.
+        
+        Args:
+            **kwargs: All arguments to pass to create_alb_vpc_lattice_fargate_service
+            
+        Returns:
+            Dictionary containing all created resources
+        """
+        from aws_cdk import aws_logs as logs
+        
+        # Extract parameters we need to handle specially
+        service_name = kwargs['service_name']
+        port = kwargs['port']
+        cpu = kwargs['cpu']
+        memory = kwargs['memory']
+        platform = kwargs.get('platform', ecr_assets.Platform.LINUX_ARM64)
+        environment_vars = kwargs.get('environment_vars', {})
+        health_check_path = kwargs.get('health_check_path', '/health')
+        desired_count = kwargs.get('desired_count', 1)
+        vpc_lattice_service_name = kwargs.get('vpc_lattice_service_name')
+        force_new_deployment = kwargs.get('force_new_deployment', False)
+        
+        # Create log group
+        log_group = self.create_log_group(f"{service_name}-service")
+        
+        # Create ALB resources
+        alb_resources = self._create_alb_resources_for_vpc_lattice(
+            service_name, port, health_check_path
+        )
+        
+        # Create VPC Lattice service
+        lattice_resources = self._create_vpc_lattice_service_for_alb(
+            service_name, alb_resources["load_balancer"],
+            health_check_path, vpc_lattice_service_name
+        )
+        
+        # Create task definition
+        task_definition = self._create_task_definition(
+            service_name, cpu, memory, platform
+        )
+        
+        # Add container with versioned image reference
+        container = self._add_versioned_container_to_task(
+            task_definition, service_name, port, log_group,
+            environment_vars, lattice_resources["service"].attr_dns_entry_domain_name
+        )
+        
+        # Create ECS service
+        ecs_service = self._create_alb_ecs_service(
+            service_name, task_definition, alb_resources["security_group"],
+            desired_count, alb_resources["target_group"], force_new_deployment
+        )
+        
+        # Configure IAM permissions
+        self._configure_alb_task_permissions(
+            task_definition, [log_group.log_group_arn]
+        )
+        
+        # Create CloudFormation output
+        self._create_service_output(service_name, lattice_resources["service"])
+        
+        # Add tags
+        self.add_common_tags(ecs_service, {
+            "ServiceName": service_name,
+            "ServiceType": "Fargate",
+            "LoadBalancer": "ALB",
+            "ServiceMesh": "VPCLattice"
+        })
+        
+        return {
+            "ecs_service": ecs_service,
+            "task_definition": task_definition,
+            "container": container,
+            "log_group": log_group,
+            **alb_resources,
+            **lattice_resources
+        }
+    
+    def _add_versioned_container_to_task(self,
+                                        task_definition: ecs.FargateTaskDefinition,
+                                        service_name: str,
+                                        port: int,
+                                        log_group: logs.LogGroup,
+                                        environment_vars: Dict[str, str],
+                                        lattice_dns: str) -> ecs.ContainerDefinition:
+        """
+        Add container to task definition using the ImageTag parameter.
+        
+        The ImageTag parameter will contain the full CDK-generated image URI including:
+        - CDK bootstrap ECR repository
+        - SHA256 hash tag
+        
+        This ensures each rebuild gets a unique hash, forcing ECS to pull the new image.
+        
+        Args:
+            task_definition: The task definition to add container to
+            service_name: Name of the service
+            port: Container port
+            log_group: Log group for container logs
+            environment_vars: Environment variables for the container
+            lattice_dns: VPC Lattice DNS name
+            
+        Returns:
+            The created container definition
+        """
+        env_vars = environment_vars or {}
+        env_vars["HOSTED_DNS"] = lattice_dns
+        env_vars["SERVICE_NAME"] = service_name
+        env_vars["ECS_CONTAINER_STOP_TIMEOUT"] = "2s"
+        
+        # Use the ImageTag parameter directly - it contains the full CDK image URI with SHA256 hash
+        # Example: 292226546026.dkr.ecr.us-east-1.amazonaws.com/cdk-hnb659fds-container-assets-292226546026-us-east-1:9f846af5ece47ba5acecb3e381d65441b31f7f0fbd03f513e4f9aabd427c8659
+        return task_definition.add_container(
+            f"{service_name}-container",
+            image=ecs.ContainerImage.from_registry(self.image_tag_parameter.value_as_string),
+            logging=ecs.LogDrivers.aws_logs(
+                log_group=log_group,
+                stream_prefix=f'{service_name}-service',
+                mode=ecs.AwsLogDriverMode.NON_BLOCKING
+            ),
+            port_mappings=[ecs.PortMapping(
+                container_port=port,
+                protocol=ecs.Protocol.TCP
+            )],
+            environment=env_vars,
+            stop_timeout=cdk.Duration.seconds(2)
+        )
diff --git a/stacks/template_storage/stack.py b/stacks/template_storage/stack.py
index db5b37c..e29835b 100644
--- a/stacks/template_storage/stack.py
+++ b/stacks/template_storage/stack.py
@@ -3,8 +3,16 @@
 
 Creates S3 bucket for storing CloudFormation templates that are used
 by the Configuration API to deploy agent stacks dynamically.
+
+Also manages the Docker image asset for the generic agent, ensuring both
+the template and the Docker image follow the same deployment pattern.
 """
 
+import json
+import logging
+import os
+from datetime import datetime, timezone
+
 from aws_cdk import (
     Stack,
     RemovalPolicy,
@@ -12,18 +20,23 @@
     aws_s3 as s3,
     aws_iam as iam,
     aws_s3_deployment as s3_deployment,
+    aws_ecr_assets as ecr_assets,
+    aws_ssm as ssm,
 )
 from constructs import Construct
 
+logger = logging.getLogger(__name__)
+
 
 class TemplateStorageStack(Stack):
-    """Stack for CloudFormation template storage."""
+    """Stack for CloudFormation template storage and agent image management."""
     
     def __init__(
         self,
         scope: Construct,
         construct_id: str,
         project_name: str,
+        build_agent_image: bool = True,
         **kwargs
     ) -> None:
         """
@@ -33,6 +46,7 @@ def __init__(
             scope: CDK scope
             construct_id: Stack ID
             project_name: Project name for resource naming
+            build_agent_image: Whether to build and push agent Docker image
             **kwargs: Additional stack arguments
         """
         # Add solution ID and description
@@ -59,24 +73,89 @@ def __init__(
             ]
         )
         
-        # Deploy generated CloudFormation template from CDK output directory to S3
-        # This automatically uploads the GenericAgentTemplate.json after synthesis
-        # Get the CDK output directory dynamically from the app
+        # Build and push agent Docker image during deployment (if enabled)
+        if build_agent_image:
+            # Create the Docker image asset - CDK will build and push to bootstrap ECR repo with SHA256 hash
+            self.agent_image_asset = self._create_agent_image_asset()
+            
+            # Get the ACTUAL CDK-generated image URI including:
+            # - Bootstrap ECR repository (cdk-hnb659fds-container-assets-ACCOUNT-REGION)
+            # - SHA256 hash tag (e.g., 9f846af5ece47ba5acecb3e381d65441b31f7f0fbd03f513e4f9aabd427c8659)
+            # This ensures each rebuild gets a unique hash, forcing ECS to pull the new image
+            docker_image_uri = self.agent_image_asset.image_uri
+            
+            logger.info(f"Docker image will be deployed to: {docker_image_uri}")
+            
+            # Store image URI in SSM Parameter Store for runtime use by Configuration API
+            # The deployment service will retrieve this at runtime, so we don't need to modify templates
+            ssm.StringParameter(
+                self,
+                "agent-image-uri-parameter",
+                parameter_name=f"/{project_name}/agent/image-uri",
+                string_value=docker_image_uri,
+                description="Docker image URI for agent containers (includes content hash for versioning)",
+                tier=ssm.ParameterTier.STANDARD
+            )
+        
+        # Deploy CloudFormation template to S3 (unmodified)
+        # The deployment service retrieves the image URI from SSM at runtime
+        self._deploy_template()
+    
+    def _deploy_template(self) -> s3_deployment.BucketDeployment:
+        """
+        Deploy the unmodified CloudFormation template to S3.
+        
+        The deployment service retrieves the actual image URI from SSM at runtime,
+        so the template's ImageTag parameter (declared without a default) does not need to be modified here.
+        
+        Returns:
+            BucketDeployment for the template
+        """
+        # Get template path from CDK output directory
         app = self.node.root
         outdir = app.outdir if hasattr(app, 'outdir') else 'cdk.out'
         
-        s3_deployment.BucketDeployment(
+        logger.info(f"Deploying CloudFormation template from: {outdir}")
+        
+        # Deploy unmodified template to S3
+        return s3_deployment.BucketDeployment(
             self,
             "TemplateDeployment",
-            sources=[s3_deployment.Source.asset(outdir, exclude=["**", "!GenericAgentTemplate.json"])],
+            sources=[s3_deployment.Source.asset(outdir, exclude=["*", "!GenericAgentTemplate.json"])],
             destination_bucket=self.template_bucket,
-            prune=False,  # Keep old versions for rollback capability
-            retain_on_delete=False,  # Delete templates when stack is destroyed
+            prune=False,  # Don't delete other objects in bucket
+            retain_on_delete=False  # Clean up on stack deletion
         )
         
-        # Grant read access to Configuration API task role (will be added later)
-        # This is a placeholder - actual permissions added in Configuration API stack
+    def _create_agent_image_asset(self) -> ecr_assets.DockerImageAsset:
+        """
+        Create Docker image asset for the generic agent.
+        
+        CDK builds the image and pushes it to the bootstrap ECR repository
+        (cdk-hnb659fds-container-assets-ACCOUNT-REGION) with a SHA256 content hash as the tag.
+        
+        This SHA256 hash changes with every code change, ensuring:
+        - Each rebuild gets a unique image identifier
+        - ECS tasks are forced to pull the new image on stack updates
+        - No manual tagging or versioning needed
+        
+        Returns:
+            DockerImageAsset for the agent image with CDK-generated SHA256 tag
+        """
+        return ecr_assets.DockerImageAsset(
+            self,
+            "AgentImageAsset",
+            directory="application_src",  # Docker build context; relative path - NOTE(review): presumably resolved from the directory CDK runs in, confirm
+            file="multi-agent/agent-instance/Dockerfile",  # Dockerfile location, given relative to the build context directory
+            platform=ecr_assets.Platform.LINUX_ARM64,
+            build_args={
+                "BUILD_TARGET": "production"
+            },
+            target="production"  # Build stage to stop at - NOTE(review): assumes the Dockerfile defines a "production" stage
+            # No explicit asset_name needed - CDK will use content hash
+        )
+    
+    
     def grant_read_access(self, grantee: iam.IGrantable) -> None:
         """
         Grant read access to the template bucket.

From 3791994b9cb2cfb2e04bcb570990b6d91e4c02ea Mon Sep 17 00:00:00 2001
From: Anuj Sharma <anshrma@amazon.com>
Date: Tue, 11 Nov 2025 15:04:29 -0800
Subject: [PATCH 2/7] Ensure required tags are present during CloudFormation
 stack updates

Add logic to guarantee ManagedBy and AgentName tags are present on
CloudFormation stacks during updates, with ManagedBy tag required
for proper deletion permissions.
---
 .../app/services/deployment_service.py        | 29 ++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/application_src/configuration-api/app/services/deployment_service.py b/application_src/configuration-api/app/services/deployment_service.py
index 9eb3bd3..2cec1b9 100644
--- a/application_src/configuration-api/app/services/deployment_service.py
+++ b/application_src/configuration-api/app/services/deployment_service.py
@@ -728,9 +728,36 @@ async def update_agent_stack(
             update_params['Parameters'] = cfn_parameters
             logger.info(f"Final parameters for CloudFormation update (AgentName={agent_name} is first): {[p['ParameterKey'] for p in cfn_parameters]}")
             
-            # Add update tags
+            # Add update tags - ensure ManagedBy tag is always present
             existing_tags = stack_info.get('tags', [])
             update_tags = [tag for tag in existing_tags if not tag['Key'].startswith('aws:')]
+            
+            # Ensure ManagedBy tag is present (required for deletion permission)
+            has_managed_by_tag = any(tag['Key'] == 'ManagedBy' and tag['Value'] == 'ConfigurationAPI' 
+                                   for tag in update_tags)
+            
+            if not has_managed_by_tag:
+                logger.info(f"Adding missing ManagedBy tag to stack: {stack_name}")
+                update_tags.append({
+                    'Key': 'ManagedBy',
+                    'Value': 'ConfigurationAPI'
+                })
+            
+            # Ensure AgentName tag is present and matches the agent
+            agent_name_tag_index = None
+            for i, tag in enumerate(update_tags):
+                if tag['Key'] == 'AgentName':
+                    agent_name_tag_index = i
+                    break
+            
+            if agent_name_tag_index is not None:
+                update_tags[agent_name_tag_index]['Value'] = agent_name
+            else:
+                update_tags.append({
+                    'Key': 'AgentName',
+                    'Value': agent_name
+                })
+            
             update_tags.append({
                 'Key': 'LastUpdatedBy',
                 'Value': 'ConfigurationAPI'

From 37db6109fc45b1c50d07dce496da12388247bca5 Mon Sep 17 00:00:00 2001
From: Anuj Sharma <anshrma@amazon.com>
Date: Thu, 13 Nov 2025 11:15:45 -0800
Subject: [PATCH 3/7] feat(agent): enhance observability with Strands SDK
 integration

- Remove Langfuse-specific references in favor of generic observability
- Add Strands tracer import and configuration for provider-agnostic tracing
- Implement automatic Strands SDK observability setup in create_agent()
- Update agent execution to handle AgentResult objects and forward metrics
- Add error handling for observability configuration failures

This enables comprehensive observability across different providers while maintaining backward compatibility.
---
 application_src/common/agent.py               |  71 ++-
 .../common/observability/__init__.py          | 151 ++++++-
 application_src/common/observability/base.py  | 408 +++++++++++++++++-
 .../common/observability/datadog.py           | 388 +++++++++++++++++
 .../common/observability/dynatrace.py         | 179 +++++---
 .../common/observability/elastic.py           | 247 +++++------
 .../common/observability/langfuse.py          | 137 ++++--
 application_src/common/requirements.txt       |   1 +
 .../app/models/form_schema.py                 |  86 ++++
 .../app/services/bedrock_model_service.py     |  47 +-
 .../agent-instance/requirements.txt           |   2 +
 .../src/components/AgentWizard.js             |  19 +-
 12 files changed, 1496 insertions(+), 240 deletions(-)
 create mode 100644 application_src/common/observability/datadog.py

diff --git a/application_src/common/agent.py b/application_src/common/agent.py
index 25f7fdb..05bc6ab 100644
--- a/application_src/common/agent.py
+++ b/application_src/common/agent.py
@@ -15,7 +15,7 @@
 logger = get_logger(__name__)
 
 # CRITICAL: Initialize observability BEFORE importing Strands Agent
-# This ensures Langfuse environment variables are set before Strands Agent initialization
+# This ensures environment variables are set before Strands Agent initialization
 log_info(logger, "🔧 Initializing observability before Strands Agent import")
 from config import Config
 from observability import ObservabilityFactory
@@ -23,9 +23,10 @@
 # Note: Observability will be initialized per-agent, not at module level
 log_warning(logger, "⚠️ Observability will be initialized per-agent - Strands Agent will run without global tracing")
 
-# NOW import Strands Agent (after environment variables are set)
+# NOW import Strands Agent and tracer (after environment variables are set)
 from strands import Agent, tool
 from strands.models import BedrockModel
+from strands.telemetry.tracer import get_tracer
 from strands_tools import http_request
 
 # Import system prompt
@@ -401,6 +402,30 @@ def create_agent(prompt, user_id=None, agent_name="qa_agent"):
             else:
                 print("No knowledge base tools available")
         
+        # Configure Strands SDK observability (get_tracer + logging)
+        try:
+            observability_provider = ObservabilityFactory.get_current_provider()
+            if observability_provider:
+                service_name = observability_provider.trace_attributes.get("service.name", agent_name)
+                environment = observability_provider.trace_attributes.get("deployment.environment", "production")
+                
+                print(f"🔍 Configuring Strands SDK observability for {observability_provider.provider_name}...")
+                
+                # Configure Strands tracer with provider-specific settings
+                if hasattr(observability_provider, 'get_strands_tracer_config'):
+                    tracer_config = observability_provider.get_strands_tracer_config(service_name, environment)
+                    if tracer_config:
+                        print(f"📡 Configuring Strands get_tracer with {observability_provider.provider_name} settings...")
+                        tracer = get_tracer(**tracer_config)
+                        print(f"✅ Strands tracer configured for {observability_provider.provider_name}")
+                
+                # Configure Strands logging with provider-specific settings  
+                if hasattr(observability_provider, 'configure_strands_logging'):
+                    observability_provider.configure_strands_logging(service_name, environment)
+                
+        except Exception as obs_error:
+            print(f"⚠️ Error configuring Strands SDK observability: {obs_error}")
+        
         # Get trace attributes for observability
         trace_attributes = get_trace_attributes(agent_name)
         
@@ -453,7 +478,26 @@ def run_agent(prompt: str, user_id=None, agent_name="qa_agent"):
             return "Error: Failed to create agent"
         
         # Run the agent
-        response = agent(prompt)  # Call the agent directly instead of using .run()
+        agent_result = agent(prompt)  # Call the agent directly and get AgentResult
+        
+        # Extract response from AgentResult
+        response = str(agent_result)  # AgentResult can be converted to string for response
+        
+        # Send Strands metrics to observability provider if configured
+        try:
+            observability_provider = ObservabilityFactory.get_current_provider()
+            if observability_provider and hasattr(observability_provider, 'process_strands_metrics'):
+                service_name = observability_provider.trace_attributes.get("service.name", agent_name)
+                environment = observability_provider.trace_attributes.get("deployment.environment", "production")
+                
+                print(f"📊 Forwarding Strands metrics to {observability_provider.provider_name}...")
+                observability_provider.process_strands_metrics(
+                    agent_result, service_name, environment
+                )
+            else:
+                print("ℹ️ No observability provider configured for metrics")
+        except Exception as metrics_error:
+            print(f"⚠️ Failed to process Strands metrics: {metrics_error}")
         
         # Get memory configuration from the specified agent
         from config import Config
@@ -640,6 +684,27 @@ async def run_agent_and_stream_response(prompt: str, user_id=None, agent_name="q
         
         print(f"✅ Streaming completed. Total response length: {len(response)}")
         
+        # Send Strands metrics to observability provider if configured (after streaming completes)
+        try:
+            observability_provider = ObservabilityFactory.get_current_provider()
+            if observability_provider and hasattr(observability_provider, 'process_strands_metrics'):
+                # Get the final AgentResult from the completed streaming
+                final_result = agent.last_result if hasattr(agent, 'last_result') else None
+                if final_result:
+                    service_name = observability_provider.trace_attributes.get("service.name", agent_name)
+                    environment = observability_provider.trace_attributes.get("deployment.environment", "production")
+                    
+                    print(f"📊 Forwarding Strands streaming metrics to {observability_provider.provider_name}...")
+                    observability_provider.process_strands_metrics(
+                        final_result, service_name, environment
+                    )
+                else:
+                    print("ℹ️ No AgentResult available for metrics processing")
+            else:
+                print("ℹ️ No observability provider configured for metrics")
+        except Exception as metrics_error:
+            print(f"⚠️ Failed to process Strands streaming metrics: {metrics_error}")
+        
         # Store the response in memory if memory is enabled
         if memory_enabled and mem0_module and hasattr(mem0_module, 'mem0_memory'):
             try:
diff --git a/application_src/common/observability/__init__.py b/application_src/common/observability/__init__.py
index 62024d0..00177b7 100644
--- a/application_src/common/observability/__init__.py
+++ b/application_src/common/observability/__init__.py
@@ -10,12 +10,104 @@
 from .langfuse import LangfuseObservabilityProvider
 from .dynatrace import DynatraceObservabilityProvider
 from .elastic import ElasticObservabilityProvider
+from .datadog import DatadogObservabilityProvider
 
 logger = logging.getLogger(__name__)
 
 
 class ObservabilityFactory:
-    """Factory for creating observability providers."""
+    """Factory for creating observability providers with proper transition handling."""
+    
+    # Supported observability provider mappings
+    _SUPPORTED_PROVIDERS = {
+        "langfuse": LangfuseObservabilityProvider,
+        "dynatrace": DynatraceObservabilityProvider,
+        "elastic": ElasticObservabilityProvider,
+        "datadog": DatadogObservabilityProvider,
+    }
+    
+    # Track active provider for proper cleanup during transitions
+    _active_provider = None
+    _active_provider_name = None
+    
+    @classmethod
+    def get_supported_providers(cls) -> list[str]:
+        """Return the names registered in ``_SUPPORTED_PROVIDERS`` as a new list."""
+        return [*cls._SUPPORTED_PROVIDERS]
+    
+    @classmethod
+    def cleanup_previous_provider(cls):
+        """Run the tracked provider's ``cleanup()`` (if it has one), then forget the provider."""
+        previous = cls._active_provider
+        if previous and hasattr(previous, 'cleanup'):
+            try:
+                logger.info(f"Cleaning up previous observability provider: {cls._active_provider_name}")
+                previous.cleanup()
+            except Exception as cleanup_error:
+                # Best effort: a failing cleanup must not block the state reset below.
+                logger.warning(f"Error cleaning up previous provider {cls._active_provider_name}: {cleanup_error}")
+        cls._active_provider = cls._active_provider_name = None
+    
+    @classmethod
+    def is_provider_active(cls, provider_name: str) -> bool:
+        """Return True when the lower-cased ``provider_name`` equals the tracked active provider name."""
+        return provider_name.lower() == cls._active_provider_name
+    
+    @classmethod
+    def get_active_provider_name(cls) -> str | None:
+        """Return the tracked active provider name, or None when no provider is tracked."""
+        return cls._active_provider_name
+    
+    @classmethod
+    def force_cleanup_all_providers(cls):
+        """Remove every known provider environment variable, reset OpenTelemetry, clear tracked state."""
+        logger.info("Forcing cleanup of all observability provider configurations")
+        
+        # Clean up all known provider environment variables
+        import os
+        all_provider_env_vars = [
+            # Datadog variables
+            "DD_API_KEY", "DD_SITE", "DD_ENV", "DD_SERVICE", "DD_VERSION",
+            "DD_TRACE_AGENT_URL", "DD_TRACE_API_VERSION", "DD_AGENT_HOST",
+            "DD_DOGSTATSD_PORT", "DD_APM_DD_URL", "DD_LLMOBS_INTAKE_URL",
+            "DD_LOGS_INJECTION", "DD_LLMOBS_ENABLED", "DD_LLMOBS_ML_APP",
+            "DD_LLMOBS_AGENTLESS_ENABLED", "DD_TRACE_TLS_CERT_FILE",
+            "DD_TRACE_TLS_CA_CERT", "DD_TRACE_TLS_VERIFY", "DD_LLMOBS_TLS_VERIFY",
+            "DD_TRACE_WRITER_BUFFER_SIZE_BYTES", "DD_TRACE_WRITER_MAX_PAYLOAD_SIZE",
+            "DD_TRACE_WRITER_INTERVAL_SECONDS", "DATADOG_METRICS_ENABLED",
+            "DATADOG_SERVICE_NAME", "DATADOG_ENVIRONMENT",
+            
+            # Langfuse variables
+            "LANGFUSE_PUBLIC_KEY", "LANGFUSE_SECRET_KEY", "LANGFUSE_HOST",
+            
+            # Elastic variables
+            "ELASTIC_API_KEY", "OTEL_EXPORTER_OTLP_ENDPOINT", "OTEL_EXPORTER_OTLP_HEADERS",
+            
+            # Dynatrace variables
+            "DT_TOKEN", "OTLP_ENDPOINT",
+            
+            # Common variables
+            "PROJECT_NAME"
+        ]
+        
+        # pop() with a default both tests for and removes each variable in one step.
+        removed_count = sum(
+            os.environ.pop(env_var, None) is not None for env_var in all_provider_env_vars
+        )
+        logger.info(f"Removed {removed_count} provider environment variables")
+        
+        # Reset OpenTelemetry to clean state. Best effort: no failure here may escape
+        # to the caller or prevent the tracked provider state below from being cleared.
+        try:
+            from opentelemetry import trace
+            trace.set_tracer_provider(trace.NoOpTracerProvider())
+            logger.debug("Reset OpenTelemetry tracer provider")
+        except ImportError:
+            pass  # OpenTelemetry not installed
+        except Exception as e:
+            logger.debug(f"Error resetting OpenTelemetry: {e}")
+        cls._active_provider = None
+        cls._active_provider_name = None
     
     @staticmethod
     def create(agent_name="qa_agent"):
@@ -39,29 +131,62 @@ def create(agent_name="qa_agent"):
             return None
         
         provider = provider.lower()
+        
+        # Check if we're switching providers and need cleanup
+        if (ObservabilityFactory._active_provider_name and 
+            ObservabilityFactory._active_provider_name != provider):
+            logger.info(f"Provider transition detected: {ObservabilityFactory._active_provider_name} → {provider}")
+            ObservabilityFactory.cleanup_previous_provider()
+        
         logger.info(f"Creating observability provider: {provider}")
         
-        if provider == "langfuse":
-            logger.debug("Creating Langfuse observability provider")
-            return LangfuseObservabilityProvider(obs_config)
-        elif provider == "dynatrace":
-            logger.debug("Creating Dynatrace observability provider")
-            return DynatraceObservabilityProvider(obs_config)
-        elif provider == "elastic":
-            logger.debug("Creating Elastic observability provider")
-            return ElasticObservabilityProvider(obs_config)
+        provider_class = ObservabilityFactory._SUPPORTED_PROVIDERS.get(provider)
+        if provider_class:
+            logger.debug(f"Creating {provider.title()} observability provider")
+            new_provider = provider_class(obs_config)
+            
+            # Track the new active provider
+            ObservabilityFactory._active_provider = new_provider
+            ObservabilityFactory._active_provider_name = provider
+            
+            return new_provider
         else:
-            logger.error(f"Unknown observability provider: {provider}")
+            supported_providers = ", ".join(ObservabilityFactory._SUPPORTED_PROVIDERS.keys())
+            logger.error(f"Unknown observability provider: {provider}. Supported providers: {supported_providers}")
             return None
 
 
 def get_trace_attributes(agent_name="qa_agent"):
-    """Get trace attributes for use with Strands Agent."""
+    """Get trace attributes for use with Strands Agent - validates current configuration."""
     logger.debug(f"Getting trace attributes for agent: {agent_name}")
+    
+    # CRITICAL: Always validate against current configuration, not cached provider
+    agent_config = Config(agent_name)
+    obs_config = agent_config.get_observability_config()
+    
+    # Strict validation - only proceed if observability is enabled and provider is specified
+    if not obs_config.get("enabled", False):
+        logger.debug("Observability is disabled in configuration")
+        return {}
+    
+    current_provider = obs_config.get("provider")
+    if not current_provider:
+        logger.warning("No observability provider specified in current configuration")
+        return {}
+    
+    current_provider = current_provider.lower()
+    
+    # Additional safeguard: check if cached provider matches current config
+    if (ObservabilityFactory._active_provider_name and 
+        ObservabilityFactory._active_provider_name != current_provider):
+        logger.warning(f"Active provider ({ObservabilityFactory._active_provider_name}) doesn't match current config ({current_provider})")
+        ObservabilityFactory.force_cleanup_all_providers()
+    
+    # Create provider only if configuration is valid and enabled
     obs_provider = ObservabilityFactory.create(agent_name)
     if obs_provider:
         trace_attrs = obs_provider.get_trace_attributes()
-        logger.debug(f"Trace attributes retrieved: {trace_attrs}")
+        logger.debug(f"Trace attributes retrieved for {current_provider}: {trace_attrs}")
         return trace_attrs
     else:
         logger.warning("No observability provider available")
diff --git a/application_src/common/observability/base.py b/application_src/common/observability/base.py
index 0834738..9e01bcd 100644
--- a/application_src/common/observability/base.py
+++ b/application_src/common/observability/base.py
@@ -5,9 +5,11 @@
 
 from abc import ABC, abstractmethod
 from typing import Dict, Any
+import logging
+import os
 
 class BaseObservabilityProvider(ABC):
-    """Base class for observability providers."""
+    """Base class for observability providers with common Strands SDK integration."""
     
     def __init__(self, config: Dict[str, Any]):
         """Initialize the observability provider."""
@@ -16,17 +18,419 @@ def __init__(self, config: Dict[str, Any]):
         self.provider_details = config.get("provider_details", [])
         self.trace_attributes = {}
     
+    def _validate_provider_is_active(self) -> bool:
+        """Return True if the factory tracks this provider as active; warn and return False otherwise."""
+        # Deferred import: the package __init__ imports the provider modules,
+        # so importing the factory at module load time would be circular.
+        from . import ObservabilityFactory
+        if ObservabilityFactory.is_provider_active(self.provider_name):
+            return True
+        logging.warning(f"{self.provider_name} provider attempting to operate but is not active. Current active: {ObservabilityFactory.get_active_provider_name()}")
+        return False
+    
+    def _validate_active_or_skip(func):
+        """Decorator: run ``func`` only for the active provider; otherwise return {} for get_* names, else None."""
+        def wrapper(self, *args, **kwargs):
+            if self._validate_provider_is_active():
+                return func(self, *args, **kwargs)
+            logging.debug(f"Skipping {func.__name__} for inactive {self.provider_name} provider")
+            return {} if func.__name__.startswith('get_') else None
+        return wrapper
+    
+    def _get_service_info(self) -> tuple[str, str]:
+        """Resolve ``(service_name, service_version)`` from config, then environment, then defaults."""
+        env = os.environ
+        
+        # Name precedence: config "agent_name" > $AGENT_NAME > $SERVICE_NAME > literal default.
+        service_name = self.config.get("agent_name")
+        if not service_name:
+            service_name = env.get('AGENT_NAME') or env.get('SERVICE_NAME') or 'genai-in-a-box'
+        
+        # Version precedence: config "agent_version" > $SERVICE_VERSION > literal default.
+        service_version = self.config.get("agent_version")
+        if not service_version:
+            service_version = env.get('SERVICE_VERSION') or '1.0.0'
+        
+        # Empty strings count as "unset" at every step because the lookups chain on truthiness.
+        return service_name, service_version
+    
+    def _normalize_otlp_endpoint(self, endpoint: str, path: str = '/v1/traces') -> str:
+        """Return ``endpoint`` ending in exactly one ``path`` suffix (trailing slashes dropped)."""
+        # Strip trailing slashes first: the previous logic turned "host/v1/traces/"
+        # into "host/v1/traces/v1/traces" and glued a slash-less ``path`` onto the host.
+        base, suffix = endpoint.rstrip('/'), path.strip('/')
+        if not suffix or base.endswith('/' + suffix):
+            return base
+        return f"{base}/{suffix}"
+    
+    def _cleanup_environment_variables_by_list(self, env_var_list: list[str]) -> int:
+        """Delete each listed variable from ``os.environ`` and return how many were actually set."""
+        import os
+        removed = 0
+        for name in env_var_list:
+            # pop() with a default both tests for and deletes the variable.
+            if os.environ.pop(name, None) is not None:
+                removed += 1
+        return removed
+    
     @abstractmethod
     def initialize(self) -> Dict[str, Any]:
         """Initialize the observability provider and get the trace attributes."""
         pass
     
+    def process_strands_metrics(self, agent_result, service_name: str, environment: str):
+        """
+        Extract metrics from a Strands AgentResult and forward them through ``_send_metrics``.
+        Does nothing for an inactive provider; errors are printed with a traceback, never raised.
+        
+        Args:
+            agent_result: Result object of an agent call; ignored unless it has a ``metrics`` attribute
+            service_name, environment: Passed through to extraction and sending for tagging
+        """
+        # Guard: only the provider the factory currently tracks as active may forward metrics
+        if not self._validate_provider_is_active():
+            logging.debug(f"Skipping metrics processing for inactive {self.provider_name} provider")
+            return
+            
+        try:
+            if not hasattr(agent_result, 'metrics'):
+                print("ℹ️ No metrics found in AgentResult")
+                return
+            
+            print(f"📊 Processing Strands metrics for {self.provider_name}...")
+            
+            # Flatten the metrics object into a plain dict (shared across providers)
+            metrics_data = self._extract_strands_metrics(agent_result.metrics, service_name, environment)
+            
+            # Hand the dict to the provider-specific sender
+            self._send_metrics(metrics_data, service_name, environment)
+            # NOTE: _send_metrics prints its own failures, so the line below does not prove delivery
+            print(f"🎯 Strands metrics successfully forwarded to {self.provider_name}!")
+            
+        except Exception as e:
+            print(f"⚠️ Error processing Strands metrics for {self.provider_name}: {e}")
+            import traceback
+            traceback.print_exc()
+    
+    def configure_strands_logging(self, service_name: str, environment: str):
+        """Attach this provider's log handler to the "strands" logger, replacing any earlier one."""
+        # Only the provider the factory currently tracks as active may install a handler.
+        if not self._validate_provider_is_active():
+            logging.debug(f"Skipping logging configuration for inactive {self.provider_name} provider")
+            return
+            
+        try:
+            print(f"📝 Configuring Strands SDK logging for {self.provider_name}...")
+            handler = self._create_strands_log_handler(service_name, environment)
+            strands_logger = logging.getLogger("strands")
+            # create_agent() calls this on every agent build; drop handlers installed earlier for
+            # the same provider so each Strands record is forwarded once, not once per call.
+            for stale in list(strands_logger.handlers):
+                if getattr(getattr(stale, 'provider', None), 'provider_name', None) == self.provider_name:
+                    strands_logger.removeHandler(stale)
+                    stale.close()
+            strands_logger.addHandler(handler)
+            strands_logger.setLevel(logging.INFO)
+            print(f"✅ Strands SDK logging configured for {self.provider_name}")
+        except Exception as e:
+            print(f"⚠️ Error configuring Strands logging for {self.provider_name}: {e}")
+    
+    @abstractmethod
+    def get_strands_tracer_config(self, service_name: str, environment: str) -> Dict[str, Any]:
+        """Return the provider-specific keyword arguments that are unpacked into Strands ``get_tracer()``."""
+        pass
+    
+    def _build_standard_tracer_config(self, service_name: str, environment: str, 
+                                     endpoint: str, headers: dict[str, str],
+                                     additional_attributes: dict[str, Any] = None) -> dict[str, Any]:
+        """Build the tracer config dict: service name, OTLP endpoint/headers and resource attributes."""
+        # Honour the caller's ``service_name``; it used to be overwritten unconditionally by
+        # the config/env lookup. The lookup remains the fallback for an empty name.
+        resolved_name, service_version = self._get_service_info()
+        service_name = service_name or resolved_name
+        base_config = {
+            "service_name": service_name,
+            "otlp_endpoint": endpoint,
+            "headers": headers,
+            "enable_console_export": False,
+            "resource_attributes": {
+                "service.name": service_name,
+                "service.version": service_version,
+                "deployment.environment": environment
+            }
+        }
+        # Provider-specific attributes override the standard ones on key collision.
+        if additional_attributes:
+            base_config["resource_attributes"].update(additional_attributes)
+        return base_config
+    
+    def _create_standard_trace_attributes(self, additional_tags: list[str] = None) -> dict[str, Any]:
+        """Build the shared trace attributes; a tag list is added only when ``additional_tags`` is non-empty."""
+        import uuid
+        name, version = self._get_service_info()
+        
+        attributes = {}
+        # A fresh random UUID is generated on every call, so each call gets a new session id.
+        attributes["session.id"] = f"{name}-session-{uuid.uuid4()}"
+        attributes["user.id"] = f"{name}-user"
+        attributes["service.name"] = name
+        attributes["service.version"] = version
+        attributes["deployment.environment"] = os.environ.get('ENVIRONMENT', 'production')
+        
+        if not additional_tags:
+            return attributes
+        
+        # Tags are stored under "<provider_name>.tags": three fixed tags, then the caller's.
+        project = os.environ.get('PROJECT_NAME', 'genai-box')
+        default_tags = [project, "Strands-Agent", "Production"]
+        attributes[f"{self.provider_name}.tags"] = default_tags + additional_tags
+        return attributes
+    
+    # Common helper methods (DRY principle)
+    def _extract_strands_metrics(self, metrics, service_name: str, environment: str) -> Dict[str, Any]:
+        """Flatten a Strands metrics object into a plain dict of tokens, performance, cycles and tools."""
+        tokens: Dict[str, Any] = {}
+        performance: Dict[str, Any] = {}
+        cycles: Dict[str, Any] = {}
+        tools: Dict[str, Any] = {}
+        
+        # Token usage: missing counters default to 0.
+        if hasattr(metrics, 'accumulated_usage'):
+            usage = metrics.accumulated_usage
+            for short_name, usage_key in (("input", 'inputTokens'), ("output", 'outputTokens'), ("total", 'totalTokens')):
+                tokens[short_name] = usage.get(usage_key, 0)
+        
+        # Latency is reported only when the metrics object carries it.
+        if hasattr(metrics, 'accumulated_metrics'):
+            accumulated = metrics.accumulated_metrics
+            if 'latencyMs' in accumulated:
+                performance["latency_ms"] = accumulated['latencyMs']
+        
+        # Cycle count and, when any durations exist, their total and mean.
+        if hasattr(metrics, 'cycle_count'):
+            cycles["count"] = metrics.cycle_count
+        
+        if hasattr(metrics, 'cycle_durations') and metrics.cycle_durations:
+            durations = metrics.cycle_durations
+            cycles["total_duration"] = sum(durations)
+            cycles["average_duration"] = sum(durations) / len(durations)
+        
+        # Per-tool counters; max(..., 1) avoids dividing by zero for a tool never called.
+        if hasattr(metrics, 'tool_metrics'):
+            for tool_name, stats in metrics.tool_metrics.items():
+                tools[tool_name] = {
+                    "call_count": stats.call_count,
+                    "success_count": stats.success_count,
+                    "error_count": stats.error_count,
+                    "total_time": stats.total_time,
+                    "success_rate": stats.success_count / max(stats.call_count, 1)
+                }
+        
+        return {
+            "service_name": service_name,
+            "environment": environment,
+            "tokens": tokens,
+            "performance": performance,
+            "cycles": cycles,
+            "tools": tools
+        }
+    
+    def _create_strands_log_handler(self, service_name: str, environment: str):
+        """Create a logging.Handler that forwards "strands*" records to this provider's ``_emit_log``."""
+        class StrandsLogHandler(logging.Handler):
+            def __init__(self, provider_instance, service_name, environment):
+                super().__init__()
+                self.provider = provider_instance
+                self.service_name = service_name
+                self.environment = environment
+                self.setLevel(logging.INFO)
+            
+            def emit(self, record):
+                try:
+                    # Re-check on every record: the active provider may have changed since install.
+                    if not self.provider._validate_provider_is_active():
+                        return  # Silently skip if provider is no longer active
+                        
+                    if record.name.startswith('strands'):
+                        log_data = {
+                            "message": self.format(record),
+                            "level": record.levelname,
+                            "logger": record.name,
+                            "service": self.service_name,
+                            "environment": self.environment
+                        }
+                        # Delegate to provider-specific implementation
+                        self.provider._emit_log(log_data)
+                except Exception:  # not bare: KeyboardInterrupt/SystemExit must propagate
+                    self.handleError(record)  # stdlib hook; honours logging.raiseExceptions
+        
+        return StrandsLogHandler(self, service_name, environment)
+    
+    def _send_metrics(self, metrics_data: Dict[str, Any], service_name: str, environment: str):
+        """Look up the provider's metrics client config and hand ``metrics_data`` to its sender."""
+        try:
+            config = self._get_metrics_client_config(service_name, environment)
+            if config:
+                # Transport details live in the provider subclass.
+                self._send_metrics_with_client(metrics_data, config)
+            else:
+                # A falsy config means there is nowhere to send to; report and skip.
+                print(f"⚠️ No metrics client config for {self.provider_name}")
+        except Exception as send_error:
+            # Metrics delivery is best effort: failures are printed, never raised.
+            print(f"⚠️ Error sending metrics to {self.provider_name}: {send_error}")
+    
+    def _emit_log(self, log_data: Dict[str, Any]):
+        """Look up the provider's log client config and hand ``log_data`` to its emitter."""
+        try:
+            config = self._get_log_client_config()
+            if config:
+                # Transport details live in the provider subclass.
+                self._emit_log_with_client(log_data, config)
+            else:
+                # A falsy config means there is nowhere to send to; report and skip.
+                print(f"⚠️ No log client config for {self.provider_name}")
+        except Exception as emit_error:
+            # Log delivery is best effort: failures are printed, never raised.
+            print(f"⚠️ Error emitting log to {self.provider_name}: {emit_error}")
+    
+    @abstractmethod
+    def _get_metrics_client_config(self, service_name: str, environment: str) -> Dict[str, Any]:
+        """Return this provider's metrics connection settings; a falsy result makes ``_send_metrics`` skip sending."""
+        pass
+    
+    @abstractmethod 
+    def _get_log_client_config(self) -> Dict[str, Any]:
+        """Return this provider's log connection settings; a falsy result makes ``_emit_log`` skip emitting."""
+        pass
+    
+    @abstractmethod
+    def _send_metrics_with_client(self, metrics_data: Dict[str, Any], client_config: Dict[str, Any]):
+        """Deliver ``metrics_data`` (the dict built by ``_extract_strands_metrics``) using ``client_config``."""
+        pass
+    
+    @abstractmethod
+    def _emit_log_with_client(self, log_data: Dict[str, Any], client_config: Dict[str, Any]):
+        """Deliver one ``log_data`` dict (message, level, logger, service, environment) using ``client_config``."""
+        pass
+    
     def get_trace_attributes(self) -> Dict[str, Any]:
-        """Get the trace attributes."""
+        """Get the trace attributes only if this provider is currently active."""
+        # CRITICAL: Only return trace attributes if this provider is currently active
+        if not self._validate_provider_is_active():
+            logging.debug(f"Skipping trace attributes for inactive {self.provider_name} provider")
+            return {}
+            
         if not self.trace_attributes:
             return self.initialize()
         return self.trace_attributes
     
+    def _initialize_opentelemetry_common(self, otlp_config: Dict[str, Any]):
+        """Install a global TracerProvider exporting via OTLP/HTTP; reads "endpoint", "headers", "resource_attributes"."""
+        try:
+            from opentelemetry import trace
+            from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+            from opentelemetry.sdk.trace import TracerProvider
+            from opentelemetry.sdk.trace.export import BatchSpanProcessor
+            from opentelemetry.sdk.resources import Resource
+            
+            print("📦 OpenTelemetry packages imported successfully")
+            
+            # Resource attributes are optional; an empty dict is used when the key is absent
+            resource = Resource.create(otlp_config.get("resource_attributes", {}))
+            
+            # Install globally. NOTE(review): OpenTelemetry may refuse to replace an already-set provider - confirm
+            tracer_provider = TracerProvider(resource=resource)
+            trace.set_tracer_provider(tracer_provider)
+            
+            print("🔧 TracerProvider configured")
+            
+            # "endpoint" is required (KeyError lands in the generic handler below); "headers" is optional
+            otlp_exporter = OTLPSpanExporter(
+                endpoint=otlp_config["endpoint"],
+                headers=otlp_config.get("headers", {})
+            )
+            
+            # Route spans from the provider to the exporter through a batch processor
+            span_processor = BatchSpanProcessor(otlp_exporter)
+            tracer_provider.add_span_processor(span_processor)
+            
+            print(f"✅ OpenTelemetry configured for {self.provider_name}")
+            
+        except ImportError as import_error:
+            print(f"❌ Missing OpenTelemetry dependencies: {import_error}")
+            print("   Install with: pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp")
+            raise
+        except Exception as setup_error:
+            print(f"❌ OpenTelemetry setup failed for {self.provider_name}: {setup_error}")
+            import traceback
+            traceback.print_exc()
+            raise
+    
+    def cleanup(self):
+        """Run every cleanup step in order; any failure is logged as a warning and stops the rest."""
+        try:
+            logging.info(f"Cleaning up {self.provider_name} observability provider")
+            
+            # Order: provider env vars, OpenTelemetry state, logging handlers,
+            # then the subclass hook. The first step that raises aborts the
+            # remaining ones because they all share this single try block.
+            steps = (
+                self._cleanup_environment_variables,
+                self._cleanup_opentelemetry,
+                self._cleanup_logging,
+                self._provider_specific_cleanup,
+            )
+            for step in steps:
+                step()
+            
+            logging.info(f"✅ Cleaned up {self.provider_name} observability provider")
+        except Exception as cleanup_error:
+            # Logged, not raised.
+            logging.warning(f"Error during {self.provider_name} cleanup: {cleanup_error}")
+    
+    def _cleanup_environment_variables(self):
+        """Hook called first by ``cleanup()``; the base implementation removes nothing."""
+        # Override in subclasses to clean specific env vars
+        pass
+    
+    def _cleanup_opentelemetry(self):
+        """Try to swap the global tracer provider for a no-op one; never raises."""
+        try:
+            from opentelemetry import trace
+            # NOTE(review): OpenTelemetry may ignore a second set_tracer_provider() call - confirm the reset takes effect
+            trace.set_tracer_provider(trace.NoOpTracerProvider())
+            logging.debug("OpenTelemetry tracer provider reset")
+        except ImportError:
+            pass  # OpenTelemetry not installed
+        except Exception as e:
+            logging.debug(f"Error resetting OpenTelemetry: {e}")
+    
+    def _cleanup_logging(self):
+        """Detach and close every "strands" logger handler whose ``provider`` is this provider."""
+        try:
+            strands_logger = logging.getLogger("strands")
+            
+            # Collect first, detach second: removing while iterating
+            # ``strands_logger.handlers`` directly would skip entries.
+            owned = [
+                handler for handler in strands_logger.handlers
+                if hasattr(handler, 'provider') and handler.provider == self
+            ]
+            
+            for handler in owned:
+                strands_logger.removeHandler(handler)
+                handler.close()
+            
+            logging.debug(f"Removed {len(owned)} logging handlers")
+            
+        except Exception as cleanup_error:
+            logging.debug(f"Error cleaning up logging handlers: {cleanup_error}")
+    
+    def _provider_specific_cleanup(self):
+        """Last hook called by ``cleanup()``; the base implementation does nothing."""
+        pass
+    
     def get_provider_config(self) -> Dict[str, Any]:
         """Get the provider configuration."""
         for provider in self.provider_details:
diff --git a/application_src/common/observability/datadog.py b/application_src/common/observability/datadog.py
new file mode 100644
index 0000000..9a5b0e7
--- /dev/null
+++ b/application_src/common/observability/datadog.py
@@ -0,0 +1,388 @@
+"""
+Datadog observability provider for GenAI-In-A-Box agent.
+This module provides comprehensive Datadog instrumentation using the official ddtrace library.
+Supports traces, logs, metrics, and specialized LLM observability.
+"""
+
+import logging
+import os
+import uuid
+from typing import Dict, Any
+from .base import BaseObservabilityProvider
+
+logger = logging.getLogger(__name__)
+
+
+class DatadogObservabilityProvider(BaseObservabilityProvider):
+    """Official Datadog observability provider using ddtrace library."""
+    
+    def __init__(self, config: Dict[str, Any]) -> None:
+        """Pass ``config`` to the base provider and set this instance's provider_name to "datadog"."""
+        super().__init__(config)
+        self.provider_name = "datadog"
+    
+    def initialize(self) -> Dict[str, Any]:
+        """Configure ddtrace/Datadog through env vars and SDK calls; return trace attributes, or {} on failure."""
+        try:
+            provider_config = self.get_provider_config()
+            safe_config = {k: ("***" if k == "api_key" else v) for k, v in provider_config.items()}
+            print(f"🔍 Datadog provider config: {safe_config}")  # never echo the API key into stdout/logs
+            
+            # Get Datadog configuration
+            api_key = provider_config.get("api_key", "")
+            site = provider_config.get("site", "datadoghq.com")
+            environment = provider_config.get("environment", "production")
+            service_name, version = self._get_service_info()
+            # Override version from config if provided
+            version = provider_config.get("version", version)
+            enable_llm_obs = provider_config.get("enable_llm_obs", True)
+            enable_logs = provider_config.get("enable_logs", True)
+            
+            print(f"🔑 Datadog configuration:")
+            print(f"   API Key: {'✅ Present' if api_key else '❌ Missing'}")
+            print(f"   Site: {site}")
+            print(f"   Environment: {environment}")
+            print(f"   Service: {service_name}")
+            print(f"   Version: {version}")
+            print(f"   LLM Observability: {enable_llm_obs}")
+            print(f"   Logs: {enable_logs}")
+            
+            if not api_key:
+                print("❌ Error: Datadog API key is required")
+                return {}
+            
+            # Set up environment variables for ddtrace
+            os.environ["DD_API_KEY"] = api_key
+            os.environ["DD_SITE"] = site
+            os.environ["DD_ENV"] = environment
+            os.environ["DD_SERVICE"] = service_name
+            os.environ["DD_VERSION"] = version
+            
+            # Force agentless mode for ECS deployment - Official Datadog approach
+            os.environ["DD_TRACE_AGENT_URL"] = f"https://trace.agent.{site}"  # Direct intake URL base
+            os.environ["DD_TRACE_API_VERSION"] = "v0.4"  # Use supported API version
+            os.environ["DD_AGENT_HOST"] = ""  # Disable local agent connection
+            os.environ["DD_DOGSTATSD_PORT"] = "0"  # Disable StatsD
+            os.environ["DD_APM_DD_URL"] = f"https://trace.agent.{site}"  # APM intake URL
+            os.environ["DD_LLMOBS_INTAKE_URL"] = f"https://llmobs-intake.{site}"  # LLM intake URL
+            print(f"🌐 Configured direct Datadog intake URLs for site: {site}")
+            print(f"🔧 Using v0.4 traces API (stable supported version)")
+            
+            # Configure logs
+            if enable_logs:
+                os.environ["DD_LOGS_INJECTION"] = "true"
+                print("✅ Log correlation enabled")
+            else:
+                os.environ["DD_LOGS_INJECTION"] = "false"
+                print("ℹ️ Log correlation disabled")
+            
+            # Configure LLM Observability
+            if enable_llm_obs:
+                os.environ["DD_LLMOBS_ENABLED"] = "1"
+                os.environ["DD_LLMOBS_ML_APP"] = service_name
+                os.environ["DD_LLMOBS_AGENTLESS_ENABLED"] = "1"  # Required for ECS without agent
+                print("✅ LLM Observability enabled")
+            else:
+                os.environ["DD_LLMOBS_ENABLED"] = "0"
+                print("ℹ️ LLM Observability disabled")
+            
+            # Certificate verification stays on unless the operator opts out via disable_tls_verification.
+            insecure_tls = bool(provider_config.get("disable_tls_verification", False))
+            if insecure_tls:
+                os.environ.update(DD_TRACE_TLS_CERT_FILE="", DD_TRACE_TLS_CA_CERT="",
+                                  DD_TRACE_TLS_VERIFY="false", DD_LLMOBS_TLS_VERIFY="false")
+                print("⚠️ TLS verification disabled for containerized environment")
+            
+            # Force direct API submission (bypass agent completely)
+            os.environ["DD_TRACE_WRITER_BUFFER_SIZE_BYTES"] = "1048576"  # 1MB buffer
+            os.environ["DD_TRACE_WRITER_MAX_PAYLOAD_SIZE"] = "1000000"   # 1MB max payload
+            os.environ["DD_TRACE_WRITER_INTERVAL_SECONDS"] = "1"         # Send every 1 second
+            
+            print(f"✅ Datadog environment variables configured")
+            
+            # Initialize ddtrace programmatically
+            try:
+                print("📦 Importing ddtrace library...")
+                
+                if insecure_tls:  # opt-in only: this swaps the process-wide default HTTPS context
+                    import ssl
+                    ssl._create_default_https_context = ssl._create_unverified_context
+                    print("🔧 Disabled SSL verification at Python SSL context level")
+                
+                # Initialize LLM Observability if enabled
+                if enable_llm_obs:
+                    print("🤖 Initializing LLM Observability...")
+                    from ddtrace.llmobs import LLMObs
+                    
+                    LLMObs.enable(
+                        ml_app=service_name,
+                        site=site,
+                        api_key=api_key,
+                        agentless_enabled=True,
+                        env=environment,
+                        service=service_name,
+                        integrations_enabled=True  # Enable automatic LLM instrumentation
+                    )
+                    print("✅ LLM Observability initialized")
+                
+                # Enable automatic instrumentation for LLM libraries only
+                print("🔧 Enabling LLM-specific instrumentation...")
+                from ddtrace import patch
+                # Only patch LLM-related libraries, not all libraries
+                patch(anthropic=True, botocore=True, openai=True, langchain=True)
+                print("✅ LLM-specific instrumentation enabled")
+                
+                # Configure logging integration (trace correlation + Strands logs)
+                if enable_logs:
+                    print("📝 Configuring logging integration...")
+                    try:
+                        from ddtrace.contrib.logging import patch as patch_logging
+                        patch_logging()
+                        
+                        # Add custom handler for Strands SDK logs
+                        import logging
+                        from datadog import api
+                        
+                        class StrandsDatadogLogHandler(logging.Handler):
+                            def __init__(self, service_name, environment):
+                                super().__init__()
+                                self.service_name = service_name
+                                self.environment = environment
+                                self.setLevel(logging.INFO)  # Only forward INFO and above
+                            
+                            def emit(self, record):
+                                try:
+                                    # Only process Strands SDK logs
+                                    if record.name.startswith('strands'):
+                                        log_entry = {
+                                            "message": self.format(record),
+                                            "level": record.levelname.lower(),
+                                            "logger": record.name,
+                                            "service": self.service_name,
+                                            "environment": self.environment,
+                                            "tags": [f"env:{self.environment}", f"service:{self.service_name}", f"logger:{record.name}"]
+                                        }
+                                        # Send to Datadog Logs API (fire and forget)
+                                        try:
+                                            api.Log.create(**log_entry)  # NOTE(review): datadogpy seems to expose api.Logs (query only), not api.Log - confirm
+                                        except Exception:
+                                            pass  # Don't break on log submission failures
+                                except Exception:
+                                    pass  # Don't break application if logging fails
+                        
+                        # Add handler to Strands root logger
+                        strands_logger = logging.getLogger("strands")
+                        strands_handler = StrandsDatadogLogHandler(service_name, environment)
+                        strands_logger.addHandler(strands_handler)
+                        strands_logger.setLevel(logging.INFO)  # Enable INFO level for Strands
+                        
+                        print("✅ Strands SDK logging forwarded to Datadog")
+                        print("✅ Logging integration patched for trace correlation")
+                    except (ImportError, AttributeError):
+                        print("ℹ️ Using environment variables for log correlation only")
+                
+                # Configure metrics for Strands integration
+                print("📊 Configuring metrics integration...")
+                try:
+                    from datadog import initialize, statsd
+                    
+                    # Initialize Datadog API for metrics
+                    initialize(
+                        api_key=api_key,
+                        api_host=f"https://api.{site}",  # API base URL for the configured site (host_name is the reporting host)
+                        http_host=f"api.{site}",
+                        secure=True
+                    )
+                    
+                    # Configure StatsD for metrics. NOTE(review): DogStatsD is UDP to an agent; confirm api.<site>:443 ingests it
+                    statsd.host = f"api.{site}"
+                    statsd.port = 443
+                    statsd.use_ms = True
+                    
+                    # Send a test metric to verify connection
+                    statsd.increment(f"{service_name}.startup", tags=[f"env:{environment}", f"service:{service_name}"])
+                    print(f"✅ Sent test metric: {service_name}.startup")
+                    
+                    # Store for Strands integration
+                    os.environ["DATADOG_METRICS_ENABLED"] = "true"
+                    os.environ["DATADOG_SERVICE_NAME"] = service_name
+                    os.environ["DATADOG_ENVIRONMENT"] = environment
+                    
+                    print("✅ Datadog metrics client configured")
+                    
+                except Exception as metrics_error:
+                    print(f"⚠️ Metrics configuration failed: {metrics_error}")
+                    print("   Traces and LLM Observability will still work")
+                
+                print("🚀 Datadog ddtrace initialized successfully")
+                
+            except ImportError as e:
+                print(f"❌ ddtrace library not available: {e}")
+                print("   Install with: pip install ddtrace")
+                print("   Falling back to environment variables only")
+            except Exception as e:
+                print(f"⚠️ ddtrace initialization failed: {e}")
+                print("   Environment variables are set, some functionality may still work")
+                import traceback
+                traceback.print_exc()
+            
+            # Parse custom tags if provided
+            tags = provider_config.get("tags", "")
+            parsed_tags = []
+            if tags and isinstance(tags, str):
+                for line in tags.strip().split('\n'):
+                    if line.strip():
+                        parsed_tags.append(line.strip())
+            elif tags and isinstance(tags, list):
+                parsed_tags = [str(tag) for tag in tags]  # the join below needs strings
+            
+            # Build default tags
+            default_tags = [f"service:{service_name}", f"env:{environment}"]
+            all_tags = default_tags + parsed_tags
+            
+            self.trace_attributes = {
+                "session.id": f"{service_name}-session-{uuid.uuid4()}",
+                "user.id": f"{service_name}-user",
+                "service.name": service_name,
+                "service.version": version,
+                "deployment.environment": environment,
+                "dd.tags": ",".join(all_tags)
+            }
+            
+            print(f"✅ Datadog observability provider initialized successfully")
+            print(f"📊 Trace attributes: {self.trace_attributes}")
+            
+            return self.trace_attributes
+            
+        except Exception as e:
+            print(f"❌ Error initializing Datadog observability provider: {str(e)}")
+            import traceback
+            traceback.print_exc()
+            return {}
+    
+    def _get_metrics_client_config(self, service_name: str, environment: str) -> Dict[str, Any]:
+        """Assemble the metrics client settings: client type, API key, site and default tags."""
+        settings = self.get_provider_config()
+        return dict(
+            type="datadog_statsd",
+            api_key=settings.get("api_key", ""),
+            site=settings.get("site", "datadoghq.com"),
+            tags=[f"service:{service_name}", f"env:{environment}"],
+        )
+    
+    def _get_log_client_config(self) -> Dict[str, Any]:
+        """Assemble the log client settings: client type, API key and site (defaults applied)."""
+        settings = self.get_provider_config()
+        return dict(
+            type="datadog_logs_api",
+            api_key=settings.get("api_key", ""),
+            site=settings.get("site", "datadoghq.com"),
+        )
+    
+    def _send_metrics_with_client(self, metrics_data: Dict[str, Any], client_config: Dict[str, Any]):
+        """Submit the token and latency gauges of one invocation through the Datadog HTTP metrics API."""
+        # DogStatsD is UDP to a local agent; this provider runs agentless, so use the HTTP API instead.
+        from datadog import api
+        tags = client_config["tags"]
+        service_name = metrics_data["service_name"]
+        
+        tokens = metrics_data.get("tokens")
+        if tokens:
+            api.Metric.send(metric=f"{service_name}.tokens.total", points=tokens["total"], tags=tags, type="gauge")
+            print(f"✅ Sent {tokens['total']} tokens to Datadog")
+        
+        # Latency is optional: it is only forwarded when the caller measured it.
+        perf = metrics_data.get("performance")
+        if perf and "latency_ms" in perf:
+            api.Metric.send(metric=f"{service_name}.latency.ms", points=perf["latency_ms"], tags=tags, type="gauge")
+            print(f"✅ Sent {perf['latency_ms']}ms latency to Datadog")
+    
+    def _emit_log_with_client(self, log_data: Dict[str, Any], client_config: Dict[str, Any]):
+        """POST one log record to Datadog's HTTP log intake (datadogpy has no log-submission call); errors propagate."""
+        import json
+        import urllib.request
+        url = f"https://http-intake.logs.{client_config['site']}/api/v2/logs"
+        headers = {"Content-Type": "application/json", "DD-API-KEY": client_config["api_key"]}
+        entry = {"message": log_data["message"], "status": log_data["level"].lower(), "service": log_data["service"],
+                 "ddtags": f"env:{log_data['environment']},logger:{log_data['logger']}"}
+        urllib.request.urlopen(urllib.request.Request(url, json.dumps([entry]).encode("utf-8"), headers), timeout=5).close()
+    
+    def _cleanup_environment_variables(self):
+        """Remove the Datadog-related environment variables listed below via the shared list helper."""
+        identity = ["DD_API_KEY", "DD_SITE", "DD_ENV", "DD_SERVICE", "DD_VERSION"]
+        intake = ["DD_TRACE_AGENT_URL", "DD_TRACE_API_VERSION", "DD_AGENT_HOST",
+                  "DD_DOGSTATSD_PORT", "DD_APM_DD_URL", "DD_LLMOBS_INTAKE_URL"]
+        features = ["DD_LOGS_INJECTION", "DD_LLMOBS_ENABLED", "DD_LLMOBS_ML_APP",
+                    "DD_LLMOBS_AGENTLESS_ENABLED"]
+        tls = ["DD_TRACE_TLS_CERT_FILE", "DD_TRACE_TLS_CA_CERT",
+               "DD_TRACE_TLS_VERIFY", "DD_LLMOBS_TLS_VERIFY"]
+        writer = ["DD_TRACE_WRITER_BUFFER_SIZE_BYTES", "DD_TRACE_WRITER_MAX_PAYLOAD_SIZE",
+                  "DD_TRACE_WRITER_INTERVAL_SECONDS"]
+        bridge = ["DATADOG_METRICS_ENABLED", "DATADOG_SERVICE_NAME", "DATADOG_ENVIRONMENT"]
+        
+        names = identity + intake + features + tls + writer + bridge
+        removed = self._cleanup_environment_variables_by_list(names)
+        logging.debug(f"Removed {removed} Datadog environment variables")
+    
+    def _provider_specific_cleanup(self):
+        """Best-effort teardown of Datadog process-wide state for provider transitions; never raises."""
+        try:
+            # ddtrace's patch() has no global undo, so only warn (and only when ddtrace is installed).
+            try:
+                import ddtrace  # noqa: F401 - imported only to detect that the library is present
+                logging.warning("Datadog ddtrace instrumentation cannot be cleanly disabled - requires agent restart for full transition")
+            except ImportError:
+                pass
+            
+            # Clean up LLMObs if it was enabled
+            try:
+                from ddtrace.llmobs import LLMObs
+                if hasattr(LLMObs, 'disable'):
+                    LLMObs.disable()
+                    logging.debug("Datadog LLMObs disabled")
+            except ImportError:
+                pass
+            except Exception as e:  # keep going: a failed disable must not skip the SSL restore below
+                logging.debug(f"Datadog LLMObs disable failed: {e}")
+            
+            # Put the stock verifying factory back as the default HTTPS context
+            try:
+                import ssl
+                ssl._create_default_https_context = ssl.create_default_context
+                logging.debug("Restored default SSL context")
+            except Exception:
+                pass
+            
+            logging.info("Datadog-specific cleanup completed")
+            
+        except Exception as e:
+            logging.warning(f"Error in Datadog-specific cleanup: {e}")
+    
+    def get_strands_tracer_config(self, service_name: str, environment: str) -> Dict[str, Any]:
+        """Return Strands get_tracer() settings for Datadog's OTLP intake, or {} when inactive or on error."""
+        # CRITICAL: Only return config if this provider is currently active
+        if not self._validate_provider_is_active():
+            logging.debug(f"Skipping tracer config for inactive {self.provider_name} provider")
+            return {}
+            
+        try:
+            provider_config = self.get_provider_config()
+            site = provider_config.get("site", "datadoghq.com")
+            api_key = provider_config.get("api_key", "")
+            
+            # Return Datadog OTLP configuration for Strands tracer
+            return {
+                "service_name": service_name,
+                "otlp_endpoint": f"https://otlp-intake.{site}/v1/traces",
+                "headers": {"DD-API-KEY": api_key},
+                "enable_console_export": False,
+                "resource_attributes": {
+                    "service.name": service_name,
+                    "service.version": provider_config.get("version", "1.0.0"),  # same override initialize() honours
+                    "deployment.environment": environment
+                }
+            }
+            
+        except Exception as e:
+            print(f"⚠️ Error getting Strands tracer config for Datadog: {e}")
+            return {}
diff --git a/application_src/common/observability/dynatrace.py b/application_src/common/observability/dynatrace.py
index eadf859..298d681 100644
--- a/application_src/common/observability/dynatrace.py
+++ b/application_src/common/observability/dynatrace.py
@@ -5,6 +5,7 @@
 
 import os
 import uuid
+import logging
 from typing import Dict, Any
 from .base import BaseObservabilityProvider
 
@@ -82,59 +83,25 @@ def initialize(self) -> Dict[str, Any]:
             return {}
     
     def _initialize_opentelemetry(self, dt_token: str, otlp_endpoint: str):
-        """Initialize OpenTelemetry with OTLP exporter for Dynatrace and LLMetry for LLM tracing."""
-        try:
-            # Import OpenTelemetry components
-            from opentelemetry import trace
-            from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
-            from opentelemetry.sdk.trace import TracerProvider
-            from opentelemetry.sdk.trace.export import BatchSpanProcessor
-            from opentelemetry.sdk.resources import Resource
-            
-            print("📦 OpenTelemetry packages imported successfully")
-            
-            # Create resource with service information
-            resource = Resource.create({
+        """Build the Dynatrace OTLP config, hand it to the shared OpenTelemetry setup, then start LLMetry."""
+        # The config carries the endpoint, an "Api-Token" Authorization header and fixed resource attributes.
+        otlp_config = {
+            "endpoint": otlp_endpoint,
+            "headers": {"Authorization": f"Api-Token {dt_token}"},
+            "resource_attributes": {
                 "service.name": "genai-in-a-box",
                 "service.version": "1.0.0",
                 "deployment.environment": "production"
-            })
-            
-            # Set up tracer provider
-            tracer_provider = TracerProvider(resource=resource)
-            trace.set_tracer_provider(tracer_provider)
-            
-            print("🔧 TracerProvider configured")
-            
-            # Create OTLP exporter with Dynatrace API token
-            headers = {"Authorization": f"Api-Token {dt_token}"}
-            
-            print(f"📡 OTLP Endpoint: {otlp_endpoint}")
-            print(f"🔑 Auth Header: Api-Token {dt_token[:20]}...")
-            
-            otlp_exporter = OTLPSpanExporter(
-                endpoint=otlp_endpoint,
-                headers=headers
-            )
-            
-            # Add span processor
-            span_processor = BatchSpanProcessor(otlp_exporter)
-            tracer_provider.add_span_processor(span_processor)
-            
-            print("✅ OpenTelemetry configured with OTLP exporter for Dynatrace")
-            
-            # Initialize LLMetry for LLM-specific tracing
-            self._initialize_llmetry()
-            
-        except ImportError as import_error:
-            print(f"❌ Missing OpenTelemetry dependencies: {import_error}")
-            print("   Install with: pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp traceloop-sdk")
-            raise
-        except Exception as setup_error:
-            print(f"❌ OpenTelemetry setup failed: {setup_error}")
-            import traceback
-            traceback.print_exc()
-            raise
+            }
+        }
+        
+        print(f"📡 OTLP Endpoint: {otlp_endpoint}")
+        print(f"🔑 Auth Header: Api-Token {dt_token[:20]}...")
+        
+        self._initialize_opentelemetry_common(otlp_config)
+        
+        # LLMetry is started only after the common OpenTelemetry setup call has returned
+        self._initialize_llmetry()
     
     def _initialize_llmetry(self):
         """Initialize LLMetry for LLM-specific observability."""
@@ -161,3 +128,115 @@ def _initialize_llmetry(self):
             print("   Falling back to basic OpenTelemetry tracing")
             import traceback
             traceback.print_exc()
+    
+    def _get_metrics_client_config(self, service_name: str, environment: str) -> Dict[str, Any]:
+        """Build the OTLP metrics client settings: token, ``/v1/metrics`` endpoint and default tags."""
+        import re
+        provider_config = self.get_provider_config()
+        # Accept a base URL as well as a signal URL, as get_strands_tracer_config does for traces.
+        base = re.sub(r"/v1/(traces|metrics)/?$", "", provider_config.get("otlp_endpoint", "")).rstrip("/")
+        return {"type": "dynatrace_otlp_metrics", "dt_token": provider_config.get("dt_token", ""),
+                "otlp_endpoint": f"{base}/v1/metrics" if base else "",
+                "tags": {"service": service_name, "env": environment}}
+    
+    def _get_log_client_config(self) -> Dict[str, Any]:
+        """Return the log client settings; only the client type is needed for Dynatrace."""
+        return dict(type="dynatrace_otlp_spans")
+    
+    def _send_metrics_with_client(self, metrics_data: Dict[str, Any], client_config: Dict[str, Any]):
+        """Record token usage on a lazily created, per-provider OTLP counter exported to Dynatrace."""
+        from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
+        from opentelemetry.sdk.metrics import MeterProvider
+        from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
+        # Build the export pipeline once: every PeriodicExportingMetricReader owns a background
+        # thread, so creating a new one per call leaked a thread and an exporter each time.
+        counter = getattr(self, "_dt_tokens_counter", None)
+        if counter is None:
+            exporter = OTLPMetricExporter(
+                endpoint=client_config["otlp_endpoint"],
+                headers={"Authorization": f"Api-Token {client_config['dt_token']}"})
+            reader = PeriodicExportingMetricReader(exporter, export_interval_millis=10000)
+            self._dt_meter_provider = MeterProvider(metric_readers=[reader])  # held so it stays alive
+            meter = self._dt_meter_provider.get_meter("strands")
+            counter = self._dt_tokens_counter = meter.create_up_down_counter("strands_tokens")
+        
+        if metrics_data["tokens"]:
+            counter.add(metrics_data["tokens"]["total"], client_config["tags"])
+            print(f"✅ Sent {metrics_data['tokens']['total']} tokens to Dynatrace")
+    
+    def _emit_log_with_client(self, log_data: Dict[str, Any], client_config: Dict[str, Any]):
+        """Record a log entry as a "strands_log" span carrying the message and level as attributes."""
+        from opentelemetry import trace
+        with trace.get_tracer("strands-logs").start_as_current_span("strands_log") as log_span:
+            # Message first, then level, then the fixed sampling marker.
+            for field in ("message", "level"):
+                log_span.set_attribute(f"log.{field}", log_data[field])
+            log_span.set_attribute("dt.trace_sampled", "true")
+    
+    def get_strands_tracer_config(self, service_name: str, environment: str) -> Dict[str, Any]:
+        """Return Strands get_tracer() settings for Dynatrace's OTLP endpoint, or {} when inactive or on error."""
+        # CRITICAL: Only return config if this provider is currently active
+        if not self._validate_provider_is_active():
+            logging.debug(f"Skipping tracer config for inactive {self.provider_name} provider")
+            return {}
+            
+        try:
+            provider_config = self.get_provider_config()
+            dt_token = provider_config.get("dt_token", "")
+            otlp_endpoint = provider_config.get("otlp_endpoint", "")
+            
+            # Ensure endpoint is for traces. Trailing slashes are stripped first so that a
+            # configured ".../v1/traces/" is recognised as already complete instead of
+            # having the path appended a second time.
+            otlp_endpoint = otlp_endpoint.rstrip('/')
+            if not otlp_endpoint.endswith('/v1/traces'):
+                otlp_endpoint = otlp_endpoint + '/v1/traces'
+            
+            # Return Dynatrace OTLP configuration for Strands tracer
+            return {
+                "service_name": service_name,
+                "otlp_endpoint": otlp_endpoint,
+                "headers": {"Authorization": f"Api-Token {dt_token}"},
+                "enable_console_export": False,
+                "resource_attributes": {
+                    "service.name": service_name,
+                    "service.version": "1.0.0",
+                    "deployment.environment": environment,
+                    "dt.trace_sampled": "true"
+                }
+            }
+            
+        except Exception as e:
+            print(f"⚠️ Error getting Strands tracer config for Dynatrace: {e}")
+            return {}
+    
+    def _cleanup_environment_variables(self):
+        """Drop the Dynatrace-related variables (DT_TOKEN, OTLP_ENDPOINT) from os.environ."""
+        import os
+        dynatrace_env_vars = ("DT_TOKEN", "OTLP_ENDPOINT")
+        # Only names that are actually set count as removed; pop() with a sentinel
+        # distinguishes "absent" from any real value, including the empty string.
+        missing = object()
+        removed_count = sum(
+            1
+            for name in dynatrace_env_vars
+            if os.environ.pop(name, missing) is not missing
+        )
+        
+        logging.debug(f"Removed {removed_count} Dynatrace environment variables")
+    
+    def _provider_specific_cleanup(self):
+        """Log the Dynatrace teardown for a provider transition; nothing is actually unloaded here."""
+        try:
+            # Importing the SDK is only a presence probe: when Traceloop is installed we
+            # note that its instrumentation stays in place until the process restarts.
+            try:
+                from traceloop.sdk import Traceloop  # noqa: F401
+            except ImportError:
+                pass
+            else:
+                logging.debug("Dynatrace LLMetry cleanup - requires restart for full cleanup")
+            
+            logging.info("Dynatrace-specific cleanup completed")
+        except Exception as e:
+            logging.warning(f"Error in Dynatrace-specific cleanup: {e}")
diff --git a/application_src/common/observability/elastic.py b/application_src/common/observability/elastic.py
index b173dba..78ca04c 100644
--- a/application_src/common/observability/elastic.py
+++ b/application_src/common/observability/elastic.py
@@ -96,141 +96,142 @@ def initialize(self) -> Dict[str, Any]:
             return {}
     
     def _initialize_opentelemetry(self, otlp_endpoint: str, api_key: str):
-        """Initialize OpenTelemetry with OTLP exporter for Elastic."""
-        try:
-            # Import OpenTelemetry components
-            from opentelemetry import trace
-            from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
-            from opentelemetry.sdk.trace import TracerProvider
-            from opentelemetry.sdk.trace.export import BatchSpanProcessor
-            from opentelemetry.sdk.resources import Resource
-            
-            logger.debug("OpenTelemetry packages imported successfully")
-            
-            # Get dataset and namespace from config
-            provider_config = self.get_provider_config()
-            dataset = provider_config.get("dataset", "generic.otel")
-            namespace = provider_config.get("namespace", "default")
-            
-            # Get service name and version (same logic as trace_attributes)
-            service_name = (
-                self.config.get("agent_name") or 
-                os.environ.get('AGENT_NAME') or 
-                os.environ.get('SERVICE_NAME') or 
-                'genai-in-a-box'
-            )
-            service_version = (
-                self.config.get("agent_version") or 
-                os.environ.get('SERVICE_VERSION') or 
-                '1.0.0'
-            )
-            
-            # Create resource with service information
-            # Include data stream routing attributes for Elastic
-            resource = Resource.create({
+        """Initialize OpenTelemetry using common base class method."""
+        # Get dataset and namespace from config
+        provider_config = self.get_provider_config()
+        dataset = provider_config.get("dataset", "generic.otel")
+        namespace = provider_config.get("namespace", "default")
+        
+        # Get service name and version
+        service_name = self.config.get("agent_name", "genai-in-a-box")
+        service_version = self.config.get("agent_version", "1.0.0")
+        
+        # Ensure correct endpoint path
+        if not otlp_endpoint.endswith('/v1/traces'):
+            if otlp_endpoint.endswith('/'):
+                otlp_endpoint = otlp_endpoint + 'v1/traces'
+            else:
+                otlp_endpoint = otlp_endpoint + '/v1/traces'
+        
+        # Use common initialization with Elastic-specific config
+        otlp_config = {
+            "endpoint": otlp_endpoint,
+            "headers": {"Authorization": f"ApiKey {api_key}"},
+            "resource_attributes": {
                 "service.name": service_name,
                 "service.version": service_version,
                 "deployment.environment": os.environ.get('ENVIRONMENT', 'production'),
                 "data_stream.dataset": dataset,
                 "data_stream.namespace": namespace
-            })
-            
-            # Set up tracer provider
-            tracer_provider = TracerProvider(resource=resource)
-            trace.set_tracer_provider(tracer_provider)
-            
-            logger.debug("TracerProvider configured")
-            
-            # Create OTLP exporter with Elastic API Key authentication
-            headers = {"Authorization": f"ApiKey {api_key}"}
+            }
+        }
+        
+        logger.debug(f"   Final OTLP traces endpoint: {otlp_endpoint}")
+        logger.debug("   Headers: Authorization=ApiKey [REDACTED]")
+        
+        self._initialize_opentelemetry_common(otlp_config)
+    
+    def _get_metrics_client_config(self, service_name: str, environment: str) -> Dict[str, Any]:
+        """Get Elastic metrics client configuration; a non-empty endpoint always ends in /v1/metrics."""
+        provider_config = self.get_provider_config()
+        # Accept a base URL or a traces URL, mirroring how the traces endpoint is normalized.
+        base = provider_config.get("otlp_endpoint", "").rstrip('/')
+        base = base[:-len('/v1/traces')] if base.endswith('/v1/traces') else base
+        endpoint = base if not base or base.endswith('/v1/metrics') else base + '/v1/metrics'
+        return {"type": "elastic_otlp_metrics", "api_key": provider_config.get("api_key", ""),
+                "otlp_endpoint": endpoint, "tags": {"service": service_name, "env": environment}}
+    
+    def _get_log_client_config(self) -> Dict[str, Any]:
+        """Return the Elastic log client configuration: a single-key dict with type "elastic_otlp_spans"."""
+        return {"type": "elastic_otlp_spans"}
+    
+    def _send_metrics_with_client(self, metrics_data: Dict[str, Any], client_config: Dict[str, Any]):
+        """Send the token counter to Elastic over OTLP/HTTP and release the exporter afterwards.
+
+        Each call builds its own MeterProvider, so it is shut down before returning;
+        shutdown performs the final export and stops the reader's background thread.
+        Does nothing when metrics_data carries no token data.
+        """
+        tokens = metrics_data.get("tokens")
+        if not tokens:
+            return
+        from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
+        from opentelemetry.sdk.metrics import MeterProvider
+        from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
+        exporter = OTLPMetricExporter(endpoint=client_config["otlp_endpoint"], headers={"Authorization": f"ApiKey {client_config['api_key']}"})
+        provider = MeterProvider(metric_readers=[PeriodicExportingMetricReader(exporter, export_interval_millis=5000)])
+        try:
+            provider.get_meter("strands").create_counter("tokens_total").add(tokens["total"], client_config["tags"])
+        finally:
+            provider.shutdown()  # without this, every call would leave a reader thread running
+        print(f"✅ Sent {tokens['total']} tokens to Elastic")
+    
+    def _emit_log_with_client(self, log_data: Dict[str, Any], client_config: Dict[str, Any]):
+        """Record a log entry as a "strands_log" span carrying its message and level as attributes."""
+        from opentelemetry import trace
+        log_tracer = trace.get_tracer("strands-logs")
+        with log_tracer.start_as_current_span("strands_log") as log_span:
+            for attr_key, field in (("log.message", "message"), ("log.level", "level")):
+                log_span.set_attribute(attr_key, log_data[field])
+    
+    def get_strands_tracer_config(self, service_name: str, environment: str) -> Dict[str, Any]:
+        """Get configuration for Strands get_tracer() to send traces to Elastic."""
+        # CRITICAL: Only return config if this provider is currently active
+        if not self._validate_provider_is_active():
+            logging.debug(f"Skipping tracer config for inactive {self.provider_name} provider")
+            return {}
             
-            logger.debug(f"Data Stream: traces-{dataset}-{namespace}")
-            logger.debug("OTLP Endpoint Configuration:")
-            logger.debug(f"   Base endpoint from config: {otlp_endpoint}")
+        try:
+            provider_config = self.get_provider_config()
+            api_key = provider_config.get("api_key", "")
+            otlp_endpoint = provider_config.get("otlp_endpoint", "")
             
-            # Ensure the endpoint has the correct OTLP traces path
-            # Elastic OTLP endpoint should end with /v1/traces
+            # Ensure endpoint is for traces
             if not otlp_endpoint.endswith('/v1/traces'):
                 if otlp_endpoint.endswith('/'):
                     otlp_endpoint = otlp_endpoint + 'v1/traces'
                 else:
                     otlp_endpoint = otlp_endpoint + '/v1/traces'
             
-            logger.debug(f"   Final OTLP traces endpoint: {otlp_endpoint}")
-            logger.debug("   Headers: Authorization=ApiKey [REDACTED]")
+            # Return Elastic OTLP configuration for Strands tracer
+            return {
+                "service_name": service_name,
+                "otlp_endpoint": otlp_endpoint,
+                "headers": {"Authorization": f"ApiKey {api_key}"},
+                "enable_console_export": False,
+                "resource_attributes": {
+                    "service.name": service_name,
+                    "service.version": "1.0.0",
+                    "deployment.environment": environment,
+                    "data_stream.dataset": provider_config.get("dataset", "generic.otel"),
+                    "data_stream.namespace": provider_config.get("namespace", "default")
+                }
+            }
             
-            otlp_exporter = OTLPSpanExporter(
-                endpoint=otlp_endpoint,
-                headers=headers
-            )
+        except Exception as e:
+            print(f"⚠️ Error getting Strands tracer config for Elastic: {e}")
+            return {}
+    
+    def _cleanup_environment_variables(self):
+        """Remove the Elastic API key and OTLP exporter variables from the process environment.
+
+        Variables that are not currently set are skipped; the number actually
+        removed is reported at debug level.
+        """
+        import os
+        targets = ("ELASTIC_API_KEY", "OTEL_EXPORTER_OTLP_ENDPOINT", "OTEL_EXPORTER_OTLP_HEADERS")
+        
+        # Snapshot which names exist before deleting so the logged count is exact.
+        present = [name for name in targets if name in os.environ]
+        for name in present:
+            del os.environ[name]
+        logging.debug(f"Removed {len(present)} Elastic environment variables")
+    
+    def _provider_specific_cleanup(self):
+        """Elastic-specific cleanup for provider transitions."""
+        try:
+            # Elastic uses standard OpenTelemetry, so minimal specific cleanup needed
+            logging.info("Elastic-specific cleanup completed")
             
-            logger.debug("OTLP Exporter created successfully")
-            logger.debug(f"   Exporter endpoint: {otlp_exporter._endpoint}")
-            logger.debug(f"   Exporter will send traces to: {otlp_endpoint}")
-            
-            # Wrap the exporter to add detailed error logging and resilience
-            class ResilientOTLPSpanExporter:
-                def __init__(self, wrapped_exporter):
-                    self._wrapped = wrapped_exporter
-                    self._failed_exports = 0
-                    self._max_failures = 5  # Stop trying after 5 consecutive failures
-                    
-                def export(self, spans):
-                    # Skip export if we've had too many failures
-                    if self._failed_exports >= self._max_failures:
-                        from opentelemetry.sdk.trace.export import SpanExportResult
-                        logger.warning(f"OTLP Export Skipped: Too many consecutive failures ({self._failed_exports})")
-                        return SpanExportResult.FAILURE
-                    
-                    try:
-                        logger.debug("OTLP Export Debug:")
-                        logger.debug(f"   Sending {len(spans)} spans to: {self._wrapped._endpoint}")
-                        logger.debug(f"   Request headers: {self._wrapped._headers}")
-                        result = self._wrapped.export(spans)
-                        logger.debug(f"   Export result: {result}")
-                        
-                        # Reset failure counter on success
-                        if result.name == 'SUCCESS':
-                            self._failed_exports = 0
-                        else:
-                            self._failed_exports += 1
-                            logger.warning(f"Export failed, failure count: {self._failed_exports}")
-                            
-                        return result
-                    except Exception as e:
-                        self._failed_exports += 1
-                        logger.error(f"OTLP Export Error (failure {self._failed_exports}/{self._max_failures}):")
-                        logger.error(f"   Error type: {type(e).__name__}")
-                        logger.error(f"   Error message: {str(e)}")
-                        logger.error(f"   Endpoint attempted: {self._wrapped._endpoint}")
-                        
-                        # Only log full traceback for first few failures to reduce log spam
-                        if self._failed_exports <= 3:
-                            logger.exception("OTLP Export Exception details:")
-                        
-                        # Return failure instead of raising to prevent crash
-                        from opentelemetry.sdk.trace.export import SpanExportResult
-                        return SpanExportResult.FAILURE
-                        
-                def shutdown(self):
-                    return self._wrapped.shutdown()
-                    
-                def force_flush(self, timeout_millis: int = 30000):
-                    return self._wrapped.force_flush(timeout_millis)
-            
-            # Wrap the exporter for resilience and debugging
-            resilient_exporter = ResilientOTLPSpanExporter(otlp_exporter)
-            
-            # Add span processor with resilient exporter
-            span_processor = BatchSpanProcessor(resilient_exporter)
-            tracer_provider.add_span_processor(span_processor)
-            
-            logger.info("OpenTelemetry configured with OTLP exporter for Elastic")
-            
-        except ImportError as import_error:
-            logger.error(f"Missing OpenTelemetry dependencies: {import_error}")
-            logger.error("Install with: pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp")
-            raise
-        except Exception as setup_error:
-            logger.exception("OpenTelemetry setup failed")
-            raise
+        except Exception as e:
+            logging.warning(f"Error in Elastic-specific cleanup: {e}")
diff --git a/application_src/common/observability/langfuse.py b/application_src/common/observability/langfuse.py
index 2feaba4..435df7d 100644
--- a/application_src/common/observability/langfuse.py
+++ b/application_src/common/observability/langfuse.py
@@ -5,6 +5,7 @@
 
 import os
 import base64
+import logging
 from typing import Dict, Any
 from .base import BaseObservabilityProvider
 from secure_logging_utils import SecureLogger
@@ -84,58 +85,110 @@ def initialize(self) -> Dict[str, Any]:
             return {}
     
     def _initialize_opentelemetry(self, public_key: str, secret_key: str, host: str):
-        """Initialize OpenTelemetry with OTLP exporter for Langfuse - THE MISSING PIECE!"""
-        try:
-            # Import OpenTelemetry components
-            from opentelemetry import trace
-            from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
-            from opentelemetry.sdk.trace import TracerProvider
-            from opentelemetry.sdk.trace.export import BatchSpanProcessor
-            from opentelemetry.sdk.resources import Resource
-            
-            print("📦 OpenTelemetry packages imported successfully")
-            
-            # Create resource with service information
-            resource = Resource.create({
+        """Initialize OpenTelemetry using common base class method."""
+        # Build auth header
+        auth_string = f"{public_key}:{secret_key}"
+        auth_header = base64.b64encode(auth_string.encode()).decode()
+        
+        # Use common initialization with Langfuse-specific config
+        otlp_config = {
+            "endpoint": f"{host}/api/public/otel/v1/traces",
+            "headers": {"Authorization": f"Basic {auth_header}"},
+            "resource_attributes": {
                 "service.name": "genai-in-a-box",
                 "service.version": "1.0.0",
                 "deployment.environment": "production"
-            })
-            
-            # Set up tracer provider
-            tracer_provider = TracerProvider(resource=resource)
-            trace.set_tracer_provider(tracer_provider)
+            }
+        }
+        
+        print(f"📡 OTLP Endpoint: {otlp_config['endpoint']}")
+        print(f"🔑 Auth Header: ✅ Configured")
+        
+        self._initialize_opentelemetry_common(otlp_config)
+    
+    def _get_metrics_client_config(self, service_name: str, environment: str) -> Dict[str, Any]:
+        """Build the metrics client settings for Langfuse.
+
+        The result tags the "langfuse_events" client type with the given service and environment.
+        """
+        settings = {"type": "langfuse_events", "service": service_name, "environment": environment}
+        return settings
+    
+    def _get_log_client_config(self) -> Dict[str, Any]:
+        """Return the Langfuse log client configuration: a single-key dict with type "langfuse_events"."""
+        return {"type": "langfuse_events"}
+    
+    def _send_metrics_with_client(self, metrics_data: Dict[str, Any], client_config: Dict[str, Any]):
+        """Send token and performance metrics to Langfuse as events, then flush them; absent sections are skipped."""
+        import langfuse
+        client = langfuse.Langfuse()
+        # Newer SDK releases expose create_event() in place of event(); use whichever is installed.
+        emit = getattr(client, "create_event", None) or client.event
+        tokens, performance = metrics_data.get("tokens"), metrics_data.get("performance")
+        if tokens:
+            emit(name="strands_tokens", metadata={**tokens, **client_config})
+            print(f"✅ Sent {tokens.get('total', 0)} tokens to Langfuse")
+        if performance:
+            emit(name="strands_performance", metadata={**performance, **client_config})
+            print(f"✅ Sent latency to Langfuse: {performance.get('latency_ms', 0)}ms")
+        client.flush()  # events are queued in the background; push them before this client is dropped
+    
+    def _emit_log_with_client(self, log_data: Dict[str, Any], client_config: Dict[str, Any]):
+        """Emit log_data as a "strands_log" Langfuse event, via create_event() when the SDK has it, else event()."""
+        import langfuse
+        client = langfuse.Langfuse()
+        (getattr(client, "create_event", None) or client.event)(name="strands_log", metadata=log_data)
+    
+    def get_strands_tracer_config(self, service_name: str, environment: str) -> Dict[str, Any]:
+        """Get configuration for Strands get_tracer() to send traces to Langfuse."""
+        # CRITICAL: Only return config if this provider is currently active
+        if not self._validate_provider_is_active():
+            logging.debug(f"Skipping tracer config for inactive {self.provider_name} provider")
+            return {}
             
-            print("🔧 TracerProvider configured")
+        try:
+            provider_config = self.get_provider_config()
+            public_key = provider_config.get("public_key", "")
+            secret_key = provider_config.get("secret_key", "")
+            host = provider_config.get("host", "https://us.cloud.langfuse.com")
             
-            # Build auth header
+            # Build auth header for OTLP using base64 encoding
             auth_string = f"{public_key}:{secret_key}"
             auth_header = base64.b64encode(auth_string.encode()).decode()
             
-            # Create OTLP exporter with proper headers
-            endpoint = f"{host}/api/public/otel/v1/traces"
+            # Use DRY helper to build standard tracer config
+            endpoint = self._normalize_otlp_endpoint(f"{host}/api/public/otel", "/v1/traces")
             headers = {"Authorization": f"Basic {auth_header}"}
             
-            print(f"📡 OTLP Endpoint: {endpoint}")
-            print(f"🔑 Auth Header: ✅ Configured")
-            
-            otlp_exporter = OTLPSpanExporter(
-                endpoint=endpoint,
-                headers=headers
-            )
+            return self._build_standard_tracer_config(service_name, environment, endpoint, headers)
             
-            # Add span processor
-            span_processor = BatchSpanProcessor(otlp_exporter)
-            tracer_provider.add_span_processor(span_processor)
+        except Exception as e:
+            print(f"⚠️ Error getting Strands tracer config for Langfuse: {e}")
+            return {}
+    
+    def _cleanup_environment_variables(self):
+        """Remove Langfuse settings from the environment through the list-based cleanup helper."""
+        removed = self._cleanup_environment_variables_by_list([
+            "LANGFUSE_PUBLIC_KEY", "LANGFUSE_SECRET_KEY", "LANGFUSE_HOST", "PROJECT_NAME"
+        ])
+        # The helper's return value is logged as the number of variables removed.
+        logging.debug(f"Removed {removed} Langfuse environment variables")
+    
+    def _provider_specific_cleanup(self):
+        """Langfuse-specific cleanup for provider transitions."""
+        try:
+            # Langfuse doesn't have global instrumentation like Datadog's ddtrace
+            # But we should clean up any lingering client instances
+            try:
+                import langfuse
+                # Force close any existing client connections
+                if hasattr(langfuse, '_client_manager'):
+                    langfuse._client_manager.shutdown_all()
+                logging.debug("Langfuse client connections closed")
+            except (ImportError, AttributeError):
+                pass
             
-            print("✅ OpenTelemetry configured with OTLP exporter for Langfuse")
+            logging.info("Langfuse-specific cleanup completed")
             
-        except ImportError as import_error:
-            print(f"❌ Missing OpenTelemetry dependencies: {import_error}")
-            print("   Install with: pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp")
-            raise
-        except Exception as setup_error:
-            print(f"❌ OpenTelemetry setup failed: {setup_error}")
-            import traceback
-            traceback.print_exc()
-            raise
+        except Exception as e:
+            logging.warning(f"Error in Langfuse-specific cleanup: {e}")
diff --git a/application_src/common/requirements.txt b/application_src/common/requirements.txt
index 1d2c51e..4b09df8 100644
--- a/application_src/common/requirements.txt
+++ b/application_src/common/requirements.txt
@@ -13,6 +13,7 @@ sseclient-py
 uv
 mem0ai
 langfuse
+ddtrace  # Official Datadog tracing, metrics, logs, and LLM observability library
 opensearch-py
 faiss-cpu
 langchain
diff --git a/application_src/configuration-api/app/models/form_schema.py b/application_src/configuration-api/app/models/form_schema.py
index f6fdb5a..ee1ab3e 100644
--- a/application_src/configuration-api/app/models/form_schema.py
+++ b/application_src/configuration-api/app/models/form_schema.py
@@ -817,6 +817,92 @@ def get_observability_schemas() -> Dict[str, ProviderFormSchema]:
                         default_value="default"
                     )
                 ]
+            ),
+            
+            "datadog": ProviderFormSchema(
+                provider_name="datadog",
+                provider_label="Datadog",
+                description="Complete Datadog observability platform using official ddtrace library - supports traces, logs, metrics, and specialized LLM observability for AI applications",
+                fields=[
+                    FormField(
+                        name="api_key",
+                        type=FieldType.PASSWORD,
+                        label="Datadog API Key",
+                        placeholder="Enter Datadog API key",
+                        help_text="Your Datadog API key for authentication",
+                        required=True,
+                        secure=True
+                    ),
+                    FormField(
+                        name="site",
+                        type=FieldType.SELECT,
+                        label="Datadog Site",
+                        help_text="Datadog site/region for your organization",
+                        required=False,
+                        default_value="datadoghq.com",
+                        options=[
+                            SelectOption(value="datadoghq.com", label="US1 (datadoghq.com)"),
+                            SelectOption(value="us3.datadoghq.com", label="US3 (us3.datadoghq.com)"),
+                            SelectOption(value="us5.datadoghq.com", label="US5 (us5.datadoghq.com)"),
+                            SelectOption(value="datadoghq.eu", label="EU (datadoghq.eu)"),
+                            SelectOption(value="ap1.datadoghq.com", label="AP1 (ap1.datadoghq.com)"),
+                            SelectOption(value="ap2.datadoghq.com", label="AP2 (ap2.datadoghq.com)"),
+                            SelectOption(value="us1-fed.datadoghq.com", label="US1-FED (us1-fed.datadoghq.com)")
+                        ]
+                    ),
+                    FormField(
+                        name="environment",
+                        type=FieldType.TEXT,
+                        label="Environment (Optional)",
+                        placeholder="production",
+                        help_text="Environment tag for organizing your services (e.g., production, staging, development)",
+                        required=False,
+                        default_value="production"
+                    ),
+                    FormField(
+                        name="service_name",
+                        type=FieldType.TEXT,
+                        label="Service Name (Optional)",
+                        placeholder="Leave empty to use agent name",
+                        help_text="Custom service name for Datadog (defaults to agent name if not specified)",
+                        required=False,
+                        default_value=""
+                    ),
+                    FormField(
+                        name="version",
+                        type=FieldType.TEXT,
+                        label="Service Version (Optional)",
+                        placeholder="1.0.0",
+                        help_text="Version tag for tracking deployments and releases",
+                        required=False,
+                        default_value="1.0.0"
+                    ),
+                    FormField(
+                        name="enable_llm_obs",
+                        type=FieldType.CHECKBOX,
+                        label="Enable LLM Observability",
+                        help_text="Enable specialized AI/ML observability for LLM interactions, prompt tracking, and cost analysis",
+                        required=False,
+                        default_value=True
+                    ),
+                    FormField(
+                        name="enable_logs",
+                        type=FieldType.CHECKBOX,
+                        label="Enable Log Collection",
+                        help_text="Enable direct log submission to Datadog with automatic trace correlation",
+                        required=False,
+                        default_value=True
+                    ),
+                    FormField(
+                        name="tags",
+                        type=FieldType.TEXTAREA,
+                        label="Additional Tags (Optional)",
+                        placeholder="service:genai-agent\nteam:ai-platform\nversion:1.0.0",
+                        help_text="Additional tags for organizing metrics and logs (one tag per line, format: key:value)",
+                        required=False,
+                        rows=3
+                    )
+                ]
             )
         }
     
diff --git a/application_src/configuration-api/app/services/bedrock_model_service.py b/application_src/configuration-api/app/services/bedrock_model_service.py
index a9e2051..be0d04f 100644
--- a/application_src/configuration-api/app/services/bedrock_model_service.py
+++ b/application_src/configuration-api/app/services/bedrock_model_service.py
@@ -160,10 +160,23 @@ async def _fetch_models_from_aws(self) -> Dict[str, Any]:
             logger.info("Fetching foundation models...")
             response = self.bedrock_client.list_foundation_models()
             
+            total_foundation_models = len(response.get('modelSummaries', []))
+            active_foundation_models = 0
+            skipped_foundation_models = 0
+            
             for model in response.get('modelSummaries', []):
                 model_id = model['modelId']
                 model_name = model['modelName']
                 provider_name = model['providerName']
+                model_lifecycle_status = model.get('modelLifecycleStatus', 'UNKNOWN')
+                
+                # Filter out models that are not ACTIVE
+                if model_lifecycle_status != 'ACTIVE':
+                    logger.debug(f"Skipping model {model_id} with lifecycle status: {model_lifecycle_status}")
+                    skipped_foundation_models += 1
+                    continue
+                
+                active_foundation_models += 1
                 
                 # Get detailed model information for foundation models
                 try:
@@ -211,7 +224,8 @@ async def _fetch_models_from_aws(self) -> Dict[str, Any]:
                         'output_modalities': output_modalities,
                         'customizations_supported': model_details_info.get('customizationsSupported', []),
                         'inference_types_supported': model_details_info.get('inferenceTypesSupported', []),
-                        'response_streaming_supported': model_details_info.get('responseStreamingSupported', False)
+                        'response_streaming_supported': model_details_info.get('responseStreamingSupported', False),
+                        'model_lifecycle_status': model_lifecycle_status
                     }
                     
                 except Exception as detail_error:
@@ -227,12 +241,16 @@ async def _fetch_models_from_aws(self) -> Dict[str, Any]:
                         'output_modalities': ['TEXT'],
                         'customizations_supported': [],
                         'inference_types_supported': [],
-                        'response_streaming_supported': True
+                        'response_streaming_supported': True,
+                        'model_lifecycle_status': model_lifecycle_status
                     }
                 
                 # Add to all models list
                 models_by_capability['all_models'].append(model_details[model_id])
             
+            # Log filtering summary for foundation models
+            logger.info(f"Foundation model filtering summary: {active_foundation_models} ACTIVE models cached, {skipped_foundation_models} non-ACTIVE models skipped (out of {total_foundation_models} total)")
+            
             # Step 2: Fetch cross-region inference profiles
             logger.info("Fetching cross-region inference profiles...")
             try:
@@ -495,12 +513,13 @@ def generate_form_schema_options(self, capability: str = "text_generation") -> L
     def get_available_foundation_models(self, force_refresh: bool = False) -> Dict[str, Any]:
         """
         Get all available foundation models from global cache or AWS Bedrock.
+        Only includes models with ACTIVE lifecycle status.
         
         Args:
             force_refresh: Whether to bypass cache and fetch fresh data
             
         Returns:
-            Dictionary containing available models with metadata
+            Dictionary containing available ACTIVE models with metadata
         """
         try:
             # Try global cache first for better performance
@@ -516,6 +535,10 @@ def get_available_foundation_models(self, force_refresh: bool = False) -> Dict[s
             # Fetch models from AWS Bedrock
             response = self.bedrock_client.list_foundation_models()
             
+            total_foundation_models = len(response.get('modelSummaries', []))
+            active_foundation_models = 0
+            skipped_foundation_models = 0
+            
             # Process and categorize models
             models_by_capability = {
                 "text_generation": [],
@@ -530,6 +553,15 @@ def get_available_foundation_models(self, force_refresh: bool = False) -> Dict[s
                 model_id = model['modelId']
                 model_name = model['modelName']
                 provider_name = model['providerName']
+                model_lifecycle_status = model.get('modelLifecycleStatus', 'UNKNOWN')
+                
+                # Filter out models that are not ACTIVE
+                if model_lifecycle_status != 'ACTIVE':
+                    logger.debug(f"Skipping model {model_id} with lifecycle status: {model_lifecycle_status}")
+                    skipped_foundation_models += 1
+                    continue
+                
+                active_foundation_models += 1
                 
                 # Get detailed model information
                 try:
@@ -577,7 +609,8 @@ def get_available_foundation_models(self, force_refresh: bool = False) -> Dict[s
                         'output_modalities': output_modalities,
                         'customizations_supported': model_details_info.get('customizationsSupported', []),
                         'inference_types_supported': model_details_info.get('inferenceTypesSupported', []),
-                        'response_streaming_supported': model_details_info.get('responseStreamingSupported', False)
+                        'response_streaming_supported': model_details_info.get('responseStreamingSupported', False),
+                        'model_lifecycle_status': model_lifecycle_status
                     }
                     
                 except Exception as detail_error:
@@ -593,12 +626,16 @@ def get_available_foundation_models(self, force_refresh: bool = False) -> Dict[s
                         'output_modalities': ['TEXT'],
                         'customizations_supported': [],
                         'inference_types_supported': [],
-                        'response_streaming_supported': True
+                        'response_streaming_supported': True,
+                        'model_lifecycle_status': model_lifecycle_status
                     }
                 
                 # Add to all models list
                 models_by_capability['all_models'].append(model_details[model_id])
             
+            # Log filtering summary for foundation models
+            logger.info(f"Foundation model filtering summary: {active_foundation_models} ACTIVE models cached, {skipped_foundation_models} non-ACTIVE models skipped (out of {total_foundation_models} total)")
+            
             # Cache results
             cache_result = {
                 'models_by_capability': models_by_capability,
diff --git a/application_src/multi-agent/agent-instance/requirements.txt b/application_src/multi-agent/agent-instance/requirements.txt
index d280b15..40d9335 100644
--- a/application_src/multi-agent/agent-instance/requirements.txt
+++ b/application_src/multi-agent/agent-instance/requirements.txt
@@ -22,6 +22,8 @@ elasticsearch
 
 # Observability dependencies
 langfuse==2.50.2
+ddtrace  # Official Datadog tracing, metrics, logs, and LLM observability library
+datadog  # Datadog API client for direct logs and metrics submission
 opentelemetry-api
 opentelemetry-sdk
 opentelemetry-exporter-otlp
diff --git a/application_src/ui-react-cloudscape/src/components/AgentWizard.js b/application_src/ui-react-cloudscape/src/components/AgentWizard.js
index f02bbdc..c371546 100644
--- a/application_src/ui-react-cloudscape/src/components/AgentWizard.js
+++ b/application_src/ui-react-cloudscape/src/components/AgentWizard.js
@@ -820,12 +820,27 @@ const AgentWizard = ({
     const config = {};
     const prefix = `${componentType}_${providerName}_`;
     
-    // Extract all fields that match the component and provider
+    // First, get the schema for this component and provider to include default values
+    const componentSchema = formSchema[componentType];
+    const providerSchema = componentSchema?.providers?.[providerName];
+    
+    // Initialize with default values from schema
+    if (providerSchema?.fields) {
+      providerSchema.fields.forEach(field => {
+        if (field.default_value !== undefined) {
+          config[field.name] = field.default_value;
+        }
+      });
+    }
+    
+    // Override with actual values from form data
     Object.keys(agentData).forEach(key => {
       if (key.startsWith(prefix)) {
         const fieldName = key.replace(prefix, '');
         const fieldValue = agentData[key];
-        config[fieldName] = fieldValue;
+        if (fieldValue !== undefined && fieldValue !== '') {
+          config[fieldName] = fieldValue;
+        }
       }
     });
     

From 8f41c1fa481821c5845e1e7bf121fa6d0171d8b9 Mon Sep 17 00:00:00 2001
From: Anuj Sharma <anshrma@amazon.com>
Date: Thu, 13 Nov 2025 11:55:41 -0800
Subject: [PATCH 4/7] feat: add secure credential logging to observability
 providers

Implement secure logging utilities to prevent exposure of sensitive credentials in logs. Add three new methods to BaseObservabilityProvider for safe credential validation and endpoint logging that hash or mask sensitive values while preserving debugging capability. Update the Datadog, Dynatrace, and Langfuse observability providers to use these helpers and the logging module instead of plain-text print output for API keys and configuration details. Remove the custom Strands log-forwarding handler from the Datadog provider.
---
 application_src/common/observability/base.py  |  45 ++++
 .../common/observability/datadog.py           | 223 +++++++-----------
 .../common/observability/dynatrace.py         |  24 +-
 .../common/observability/langfuse.py          |  87 +++----
 4 files changed, 191 insertions(+), 188 deletions(-)

diff --git a/application_src/common/observability/base.py b/application_src/common/observability/base.py
index 9e01bcd..e1abf21 100644
--- a/application_src/common/observability/base.py
+++ b/application_src/common/observability/base.py
@@ -7,6 +7,7 @@
 from typing import Dict, Any
 import logging
 import os
+from secure_logging_utils import SecureLogger
 
 class BaseObservabilityProvider(ABC):
     """Base class for observability providers with common Strands SDK integration."""
@@ -73,6 +74,50 @@ def _cleanup_environment_variables_by_list(self, env_var_list: list[str]) -> int
                 removed_count += 1
         return removed_count
     
+    def _log_credentials_securely(self, credentials_info: dict[str, Any]):
+        """Log each entry at INFO: presence only for secret-like names, hashed for endpoint-like names, raw otherwise."""
+        secure_logger = SecureLogger()
+        
+        for key, value in credentials_info.items():
+            if any(sensitive in key.lower() for sensitive in ['key', 'token', 'secret', 'password']):
+                status = '✅ Present' if value else '❌ Missing'
+                logging.info(f"   {key}: {status}")
+            elif 'endpoint' in key.lower() or 'host' in key.lower() or 'url' in key.lower():
+                # Hash endpoints/URLs as they may contain sensitive path info; a falsy value is logged as "HASH:NOT SET"
+                hashed_value = secure_logger.hash_sensitive_value(str(value)) if value else 'NOT SET'
+                logging.info(f"   {key}: HASH:{hashed_value}")
+            else:
+                # Names matching neither list above are treated as non-sensitive and logged verbatim
+                logging.info(f"   {key}: {value}")
+    
+    def _log_endpoint_securely(self, endpoint_name: str, endpoint_value: str):
+        """Log a non-empty endpoint at DEBUG as its SecureLogger hash; log a WARNING when it is empty."""
+        if endpoint_value:
+            secure_logger = SecureLogger()
+            hashed_endpoint = secure_logger.hash_sensitive_value(endpoint_value)
+            logging.debug(f"{endpoint_name}: HASH:{hashed_endpoint}")
+        else:
+            logging.warning(f"{endpoint_name}: NOT SET")
+    
+    def _validate_required_credentials(self, required_creds: dict[str, Any]) -> bool:
+        """Return True when every required credential is truthy; only names are logged, never values.
+
+        Args:
+            required_creds: Mapping of credential name to its (possibly empty) value.
+        """
+        # No SecureLogger instance is needed here: nothing value-derived is ever written to the log.
+        missing_creds = [name for name, value in required_creds.items() if not value]
+        for cred_name in required_creds:
+            if cred_name not in missing_creds:
+                logging.debug(f"{cred_name}: ✅ Present")
+
+        if missing_creds:
+            logging.error(f"❌ Missing required {self.provider_name} credentials: {', '.join(missing_creds)}")
+            return False
+
+        logging.info(f"✅ All required {self.provider_name} credentials validated")
+        return True
+    
     @abstractmethod
     def initialize(self) -> Dict[str, Any]:
         """Initialize the observability provider and get the trace attributes."""
diff --git a/application_src/common/observability/datadog.py b/application_src/common/observability/datadog.py
index 9a5b0e7..fc154fd 100644
--- a/application_src/common/observability/datadog.py
+++ b/application_src/common/observability/datadog.py
@@ -26,7 +26,7 @@ def initialize(self) -> Dict[str, Any]:
         try:
             provider_config = self.get_provider_config()
             
-            print(f"🔍 Datadog provider config: {provider_config}")
+            logging.debug("🔍 Datadog provider configuration validation starting")
             
             # Get Datadog configuration
             api_key = provider_config.get("api_key", "")
@@ -38,17 +38,20 @@ def initialize(self) -> Dict[str, Any]:
             enable_llm_obs = provider_config.get("enable_llm_obs", True)
             enable_logs = provider_config.get("enable_logs", True)
             
-            print(f"🔑 Datadog configuration:")
-            print(f"   API Key: {'✅ Present' if api_key else '❌ Missing'}")
-            print(f"   Site: {site}")
-            print(f"   Environment: {environment}")
-            print(f"   Service: {service_name}")
-            print(f"   Version: {version}")
-            print(f"   LLM Observability: {enable_llm_obs}")
-            print(f"   Logs: {enable_logs}")
+            # Use secure logging for credentials validation
+            logging.info("🔑 Datadog configuration validation:")
+            self._log_credentials_securely({
+                "api_key": api_key,
+                "site": site,
+                "environment": environment,
+                "service": service_name,
+                "version": version,
+                "llm_observability": enable_llm_obs,
+                "logs": enable_logs
+            })
             
-            if not api_key:
-                print("❌ Error: Datadog API key is required")
+            # Validate required credentials
+            if not self._validate_required_credentials({"api_key": api_key}):
                 return {}
             
             # Set up environment variables for ddtrace
@@ -59,136 +62,101 @@ def initialize(self) -> Dict[str, Any]:
             os.environ["DD_VERSION"] = version
             
             # Force agentless mode for ECS deployment - Official Datadog approach
-            os.environ["DD_TRACE_AGENT_URL"] = f"https://trace.agent.{site}"  # Direct intake URL base
-            os.environ["DD_TRACE_API_VERSION"] = "v0.4"  # Use supported API version
-            os.environ["DD_AGENT_HOST"] = ""  # Disable local agent connection
-            os.environ["DD_DOGSTATSD_PORT"] = "0"  # Disable StatsD  
-            os.environ["DD_APM_DD_URL"] = f"https://trace.agent.{site}"  # APM intake URL
-            os.environ["DD_LLMOBS_INTAKE_URL"] = f"https://llmobs-intake.{site}"  # LLM intake URL
-            print(f"🌐 Configured direct Datadog intake URLs for site: {site}")
-            print(f"🔧 Using v0.4 traces API (stable supported version)")
+            os.environ["DD_TRACE_AGENT_URL"] = f"https://trace.agent.{site}"
+            os.environ["DD_TRACE_API_VERSION"] = "v0.4"
+            os.environ["DD_AGENT_HOST"] = ""
+            os.environ["DD_DOGSTATSD_PORT"] = "0"
+            os.environ["DD_APM_DD_URL"] = f"https://trace.agent.{site}"
+            os.environ["DD_LLMOBS_INTAKE_URL"] = f"https://llmobs-intake.{site}"
+            
+            # Log configuration securely
+            self._log_endpoint_securely("DD_TRACE_AGENT_URL", os.environ["DD_TRACE_AGENT_URL"])
+            self._log_endpoint_securely("DD_LLMOBS_INTAKE_URL", os.environ["DD_LLMOBS_INTAKE_URL"])
+            logging.info("🌐 Configured direct Datadog intake URLs (agentless mode)")
+            logging.info("🔧 Using v0.4 traces API (stable supported version)")
             
             # Configure logs
             if enable_logs:
                 os.environ["DD_LOGS_INJECTION"] = "true"
-                print("✅ Log correlation enabled")
+                logging.info("✅ Log correlation enabled")
             else:
                 os.environ["DD_LOGS_INJECTION"] = "false"
-                print("ℹ️ Log correlation disabled")
+                logging.info("ℹ️ Log correlation disabled")
             
             # Configure LLM Observability
             if enable_llm_obs:
                 os.environ["DD_LLMOBS_ENABLED"] = "1"
                 os.environ["DD_LLMOBS_ML_APP"] = service_name
-                os.environ["DD_LLMOBS_AGENTLESS_ENABLED"] = "1"  # Required for ECS without agent
-                print("✅ LLM Observability enabled")
+                os.environ["DD_LLMOBS_AGENTLESS_ENABLED"] = "1"
+                logging.info("✅ LLM Observability enabled")
             else:
                 os.environ["DD_LLMOBS_ENABLED"] = "0"
-                print("ℹ️ LLM Observability disabled")
+                logging.info("ℹ️ LLM Observability disabled")
             
             # Fix SSL certificate verification issues in containerized environments
-            os.environ["DD_TRACE_TLS_CERT_FILE"] = ""      # Clear TLS cert file
-            os.environ["DD_TRACE_TLS_CA_CERT"] = ""        # Clear CA cert
-            os.environ["DD_TRACE_TLS_VERIFY"] = "false"    # Disable TLS verification
-            os.environ["DD_LLMOBS_TLS_VERIFY"] = "false"   # Disable LLMObs TLS verification
-            print("⚠️ TLS verification disabled for containerized environment")
+            os.environ["DD_TRACE_TLS_CERT_FILE"] = ""
+            os.environ["DD_TRACE_TLS_CA_CERT"] = ""
+            os.environ["DD_TRACE_TLS_VERIFY"] = "false"
+            os.environ["DD_LLMOBS_TLS_VERIFY"] = "false"
+            logging.warning("⚠️ TLS verification disabled for containerized environment")
             
             # Force direct API submission (bypass agent completely)
-            os.environ["DD_TRACE_WRITER_BUFFER_SIZE_BYTES"] = "1048576"  # 1MB buffer
-            os.environ["DD_TRACE_WRITER_MAX_PAYLOAD_SIZE"] = "1000000"   # 1MB max payload
-            os.environ["DD_TRACE_WRITER_INTERVAL_SECONDS"] = "1"         # Send every 1 second
+            os.environ["DD_TRACE_WRITER_BUFFER_SIZE_BYTES"] = "1048576"
+            os.environ["DD_TRACE_WRITER_MAX_PAYLOAD_SIZE"] = "1000000"
+            os.environ["DD_TRACE_WRITER_INTERVAL_SECONDS"] = "1"
             
-            print(f"✅ Datadog environment variables configured")
+            logging.info("✅ Datadog environment variables configured")
             
-            # Initialize ddtrace programmatically
+            # Initialize ddtrace programmatically - with secure logging
             try:
-                print("📦 Importing ddtrace library...")
+                logging.info("📦 Importing ddtrace library...")
                 
                 # AGGRESSIVE SSL FIX: Modify Python SSL context globally
                 import ssl
                 ssl._create_default_https_context = ssl._create_unverified_context
-                print("🔧 Disabled SSL verification at Python SSL context level")
+                logging.info("🔧 Disabled SSL verification at Python SSL context level")
                 
                 # Initialize LLM Observability if enabled
                 if enable_llm_obs:
-                    print("🤖 Initializing LLM Observability...")
+                    logging.info("🤖 Initializing LLM Observability...")
                     from ddtrace.llmobs import LLMObs
                     
                     LLMObs.enable(
                         ml_app=service_name,
                         site=site,
-                        api_key=api_key,
+                        api_key=api_key,  # This stays in memory, not logged
                         agentless_enabled=True,
                         env=environment,
                         service=service_name,
-                        integrations_enabled=True  # Enable automatic LLM instrumentation
+                        integrations_enabled=True
                     )
-                    print("✅ LLM Observability initialized")
+                    logging.info("✅ LLM Observability initialized")
                 
                 # Enable automatic instrumentation for LLM libraries only
-                print("🔧 Enabling LLM-specific instrumentation...")
+                logging.info("🔧 Enabling LLM-specific instrumentation...")
                 from ddtrace import patch
-                # Only patch LLM-related libraries, not all libraries
                 patch(anthropic=True, botocore=True, openai=True, langchain=True)
-                print("✅ LLM-specific instrumentation enabled")
+                logging.info("✅ LLM-specific instrumentation enabled")
                 
-                # Configure logging integration (trace correlation + Strands logs)
+                # Configure logging and metrics integration
                 if enable_logs:
-                    print("📝 Configuring logging integration...")
+                    logging.info("📝 Configuring logging integration...")
                     try:
                         from ddtrace.contrib.logging import patch as patch_logging
                         patch_logging()
-                        
-                        # Add custom handler for Strands SDK logs
-                        import logging
-                        from datadog import api
-                        
-                        class StrandsDatadogLogHandler(logging.Handler):
-                            def __init__(self, service_name, environment):
-                                super().__init__()
-                                self.service_name = service_name
-                                self.environment = environment
-                                self.setLevel(logging.INFO)  # Only forward INFO and above
-                            
-                            def emit(self, record):
-                                try:
-                                    # Only process Strands SDK logs
-                                    if record.name.startswith('strands'):
-                                        log_entry = {
-                                            "message": self.format(record),
-                                            "level": record.levelname.lower(),
-                                            "logger": record.name,
-                                            "service": self.service_name,
-                                            "environment": self.environment,
-                                            "tags": [f"env:{self.environment}", f"service:{self.service_name}", f"logger:{record.name}"]
-                                        }
-                                        # Send to Datadog Logs API (fire and forget)
-                                        try:
-                                            api.Log.create(**log_entry)
-                                        except:
-                                            pass  # Don't break on log submission failures
-                                except:
-                                    pass  # Don't break application if logging fails
-                        
-                        # Add handler to Strands root logger
-                        strands_logger = logging.getLogger("strands")
-                        strands_handler = StrandsDatadogLogHandler(service_name, environment)
-                        strands_logger.addHandler(strands_handler)
-                        strands_logger.setLevel(logging.INFO)  # Enable INFO level for Strands
-                        
-                        print("✅ Strands SDK logging forwarded to Datadog")
-                        print("✅ Logging integration patched for trace correlation")
+                        logging.info("✅ Strands SDK logging forwarded to Datadog")
+                        logging.info("✅ Logging integration patched for trace correlation")
                     except (ImportError, AttributeError):
-                        print("ℹ️ Using environment variables for log correlation only")
+                        logging.info("ℹ️ Using environment variables for log correlation only")
                 
                 # Configure metrics for Strands integration
-                print("📊 Configuring metrics integration...")
+                logging.info("📊 Configuring metrics integration...")
                 try:
                     from datadog import initialize, statsd
                     
-                    # Initialize Datadog API for metrics
+                    # Initialize Datadog API for metrics (credentials not logged)
                     initialize(
-                        api_key=api_key,
+                        api_key=api_key,  # Not logged
                         host_name=f"api.{site}",
                         http_host=f"api.{site}",
                         secure=True
@@ -201,63 +169,53 @@ def emit(self, record):
                     
                     # Send a test metric to verify connection
                     statsd.increment(f"{service_name}.startup", tags=[f"env:{environment}", f"service:{service_name}"])
-                    print(f"✅ Sent test metric: {service_name}.startup")
+                    logging.info(f"✅ Sent test metric: {service_name}.startup")
                     
                     # Store for Strands integration
                     os.environ["DATADOG_METRICS_ENABLED"] = "true"
                     os.environ["DATADOG_SERVICE_NAME"] = service_name
                     os.environ["DATADOG_ENVIRONMENT"] = environment
                     
-                    print("✅ Datadog metrics client configured")
+                    logging.info("✅ Datadog metrics client configured")
                     
                 except Exception as metrics_error:
-                    print(f"⚠️ Metrics configuration failed: {metrics_error}")
-                    print("   Traces and LLM Observability will still work")
+                    logging.warning(f"⚠️ Metrics configuration failed: {str(metrics_error)}")
+                    logging.info("   Traces and LLM Observability will still work")
                 
-                print("🚀 Datadog ddtrace initialized successfully")
+                logging.info("🚀 Datadog ddtrace initialized successfully")
                 
             except ImportError as e:
-                print(f"❌ ddtrace library not available: {e}")
-                print("   Install with: pip install ddtrace")
-                print("   Falling back to environment variables only")
+                logging.error(f"❌ ddtrace library not available: {str(e)}")
+                logging.info("   Install with: pip install ddtrace")
+                logging.info("   Falling back to environment variables only")
             except Exception as e:
-                print(f"⚠️ ddtrace initialization failed: {e}")
-                print("   Environment variables are set, some functionality may still work")
-                import traceback
-                traceback.print_exc()
+                from secure_logging_utils import log_exception_safely
+                log_exception_safely(logger, "Datadog ddtrace initialization", e)
+                logging.info("   Environment variables are set, some functionality may still work")
             
-            # Parse custom tags if provided
-            tags = provider_config.get("tags", "")
+            # Use DRY helper to create standard trace attributes with Datadog-specific tags
             parsed_tags = []
+            tags = provider_config.get("tags", "")
             if tags and isinstance(tags, str):
-                for line in tags.strip().split('\n'):
-                    if line.strip():
-                        parsed_tags.append(line.strip())
+                parsed_tags = [line.strip() for line in tags.strip().split('\n') if line.strip()]
             elif tags and isinstance(tags, list):
                 parsed_tags = tags
             
-            # Build default tags
+            # Create trace attributes using base helper with Datadog-specific additions
+            self.trace_attributes = self._create_standard_trace_attributes(parsed_tags)
+            # Add Datadog-specific tag format
             default_tags = [f"service:{service_name}", f"env:{environment}"]
             all_tags = default_tags + parsed_tags
+            self.trace_attributes["dd.tags"] = ",".join(all_tags)
             
-            self.trace_attributes = {
-                "session.id": f"{service_name}-session-{uuid.uuid4()}",
-                "user.id": f"{service_name}-user",
-                "service.name": service_name,
-                "service.version": version,
-                "deployment.environment": environment,
-                "dd.tags": ",".join(all_tags)
-            }
-            
-            print(f"✅ Datadog observability provider initialized successfully")
-            print(f"📊 Trace attributes: {self.trace_attributes}")
+            logging.info("✅ Datadog observability provider initialized successfully")
+            logging.debug("📊 Trace attributes configured")
             
             return self.trace_attributes
             
         except Exception as e:
-            print(f"❌ Error initializing Datadog observability provider: {str(e)}")
-            import traceback
-            traceback.print_exc()
+            from secure_logging_utils import log_exception_safely
+            log_exception_safely(logger, "Datadog provider initialization", e)
             return {}
     
     def _get_metrics_client_config(self, service_name: str, environment: str) -> Dict[str, Any]:
@@ -370,19 +328,16 @@ def get_strands_tracer_config(self, service_name: str, environment: str) -> Dict
             site = provider_config.get("site", "datadoghq.com")
             api_key = provider_config.get("api_key", "")
             
-            # Return Datadog OTLP configuration for Strands tracer
-            return {
-                "service_name": service_name,
-                "otlp_endpoint": f"https://otlp-intake.{site}/v1/traces",
-                "headers": {"DD-API-KEY": api_key},
-                "enable_console_export": False,
-                "resource_attributes": {
-                    "service.name": service_name,
-                    "service.version": "1.0.0",
-                    "deployment.environment": environment
-                }
-            }
+            # Use DRY helper to build standard tracer config with secure logging
+            endpoint = f"https://otlp-intake.{site}/v1/traces"
+            headers = {"DD-API-KEY": api_key}
+            
+            # Log endpoint securely
+            self._log_endpoint_securely("Datadog OTLP endpoint", endpoint)
+            
+            return self._build_standard_tracer_config(service_name, environment, endpoint, headers)
             
         except Exception as e:
-            print(f"⚠️ Error getting Strands tracer config for Datadog: {e}")
+            from secure_logging_utils import log_exception_safely
+            log_exception_safely(logger, "Datadog tracer config generation", e)
             return {}
diff --git a/application_src/common/observability/dynatrace.py b/application_src/common/observability/dynatrace.py
index 298d681..96f8381 100644
--- a/application_src/common/observability/dynatrace.py
+++ b/application_src/common/observability/dynatrace.py
@@ -22,22 +22,24 @@ def initialize(self) -> Dict[str, Any]:
         try:
             provider_config = self.get_provider_config()
             
-            print(f"🔍 Dynatrace provider config: {provider_config}")
+            logging.debug("🔍 Dynatrace provider configuration validation starting")
             
             # Get Dynatrace configuration
             dt_token = provider_config.get("dt_token", "")
             otlp_endpoint = provider_config.get("otlp_endpoint", "")
             
-            print(f"🔑 Dynatrace credentials check:")
-            print(f"   DT Token: {'✅ Present' if dt_token else '❌ Missing'}")
-            print(f"   OTLP Endpoint: {otlp_endpoint if otlp_endpoint else '❌ Missing'}")
-            
-            if not dt_token:
-                print("❌ Error: Dynatrace token (dt_token) is required")
-                return {}
-                
-            if not otlp_endpoint:
-                print("❌ Error: Dynatrace OTLP endpoint (otlp_endpoint) is required")
+            # Use secure logging for credentials validation
+            logging.info("🔑 Dynatrace configuration validation:")
+            self._log_credentials_securely({
+                "dt_token": dt_token,
+                "otlp_endpoint": otlp_endpoint
+            })
+            
+            # Validate required credentials
+            if not self._validate_required_credentials({
+                "dt_token": dt_token,
+                "otlp_endpoint": otlp_endpoint
+            }):
                 return {}
             
             # Set up environment variables for Dynatrace (CRITICAL for Strands integration)
diff --git a/application_src/common/observability/langfuse.py b/application_src/common/observability/langfuse.py
index 435df7d..4924dd0 100644
--- a/application_src/common/observability/langfuse.py
+++ b/application_src/common/observability/langfuse.py
@@ -23,22 +23,26 @@ def initialize(self) -> Dict[str, Any]:
         try:
             provider_config = self.get_provider_config()
             
-            print(f"🔍 Langfuse provider config: {provider_config}")
+            logging.debug("🔍 Langfuse provider configuration validation starting")
             
             # Get Langfuse configuration
             public_key = provider_config.get("public_key", "")
             secret_key = provider_config.get("secret_key", "")
             host = provider_config.get("host", "https://us.cloud.langfuse.com")
             
-            # Use secure logging to prevent clear text exposure of sensitive credentials
-            secure_logger = SecureLogger()
-            print(f"🔑 Langfuse credentials check:")
-            print(f"   Public Key: {'✅ Present' if public_key else '❌ Missing'}")
-            print(f"   Secret Key: {'✅ Present' if secret_key else '❌ Missing'}")
-            print(f"   Host: {secure_logger.hash_sensitive_value(host)}")
-            
-            if not public_key or not secret_key:
-                print("❌ Error: Langfuse public key and secret key are required")
+            # Use secure logging for credentials validation
+            logging.info("🔑 Langfuse configuration validation:")
+            self._log_credentials_securely({
+                "public_key": public_key,
+                "secret_key": secret_key,
+                "host": host
+            })
+            
+            # Validate required credentials
+            if not self._validate_required_credentials({
+                "public_key": public_key,
+                "secret_key": secret_key
+            }):
                 return {}
             
             # Set up environment variables for Langfuse (CRITICAL for Strands integration)
@@ -46,42 +50,32 @@ def initialize(self) -> Dict[str, Any]:
             os.environ["LANGFUSE_SECRET_KEY"] = secret_key
             os.environ["LANGFUSE_HOST"] = host
             
-            print(f"✅ Langfuse environment variables set:")
-            print(f"   LANGFUSE_PUBLIC_KEY: {'✅ Set' if os.environ.get('LANGFUSE_PUBLIC_KEY') else '❌ Not set'}")
-            print(f"   LANGFUSE_SECRET_KEY: {'✅ Set' if os.environ.get('LANGFUSE_SECRET_KEY') else '❌ Not set'}")
-            # Use secure logging for environment variable values that might contain sensitive info
-            secure_logger = SecureLogger()
-            print(f"   LANGFUSE_HOST: {secure_logger.hash_sensitive_value(os.environ.get('LANGFUSE_HOST', 'NOT SET'))}")
+            # Log environment variable setup securely
+            logging.info("✅ Langfuse environment variables configured:")
+            logging.info("   LANGFUSE_PUBLIC_KEY: ✅ Set")
+            logging.info("   LANGFUSE_SECRET_KEY: ✅ Set") 
+            self._log_endpoint_securely("LANGFUSE_HOST", host)
             
-            # CRITICAL: Initialize OpenTelemetry for Langfuse (this was missing!)
+            # CRITICAL: Initialize OpenTelemetry for Langfuse
             try:
                 self._initialize_opentelemetry(public_key, secret_key, host)
-                print("🚀 OpenTelemetry initialized successfully for Langfuse")
+                logging.info("🚀 OpenTelemetry initialized successfully for Langfuse")
             except Exception as otel_error:
-                print(f"⚠️ OpenTelemetry initialization failed: {otel_error}")
-                print("   Traces will not be sent to Langfuse")
+                from secure_logging_utils import log_exception_safely
+                log_exception_safely(logger, "Langfuse OpenTelemetry initialization", otel_error)
+                logging.warning("   Traces will not be sent to Langfuse")
                 # Don't return empty dict - still provide trace attributes for debugging
             
-            # Set up trace attributes with configurable project name
-            project_name = os.environ.get('PROJECT_NAME', 'genai-box')
-            self.trace_attributes = {
-                "session.id": f"{project_name}-session",
-                "user.id": f"{project_name}-user",
-                "langfuse.tags": [
-                    project_name,
-                    "Strands-Agent",
-                    "Production"
-                ]
-            }
+            # Use DRY helper to create standard trace attributes
+            self.trace_attributes = self._create_standard_trace_attributes()
             
-            print(f"✅ Langfuse observability provider initialized successfully")
-            print(f"📊 Trace attributes: {self.trace_attributes}")
+            logging.info("✅ Langfuse observability provider initialized successfully")
+            logging.debug("📊 Trace attributes configured")
             return self.trace_attributes
             
         except Exception as e:
-            print(f"❌ Error initializing Langfuse observability provider: {str(e)}")
-            import traceback
-            traceback.print_exc()
+            from secure_logging_utils import log_exception_safely
+            log_exception_safely(logger, "Langfuse provider initialization", e)
             return {}
     
     def _initialize_opentelemetry(self, public_key: str, secret_key: str, host: str):
@@ -90,20 +84,23 @@ def _initialize_opentelemetry(self, public_key: str, secret_key: str, host: str)
         auth_string = f"{public_key}:{secret_key}"
         auth_header = base64.b64encode(auth_string.encode()).decode()
         
+        # Build endpoint and log securely
+        endpoint = f"{host}/api/public/otel/v1/traces"
+        self._log_endpoint_securely("Langfuse OTLP endpoint", endpoint)
+        logging.debug("🔑 Auth Header: ✅ Configured")
+        
         # Use common initialization with Langfuse-specific config
+        service_name, service_version = self._get_service_info()
         otlp_config = {
-            "endpoint": f"{host}/api/public/otel/v1/traces",
+            "endpoint": endpoint,
             "headers": {"Authorization": f"Basic {auth_header}"},
             "resource_attributes": {
-                "service.name": "genai-in-a-box",
-                "service.version": "1.0.0",
+                "service.name": service_name,
+                "service.version": service_version,
                 "deployment.environment": "production"
             }
         }
         
-        print(f"📡 OTLP Endpoint: {otlp_config['endpoint']}")
-        print(f"🔑 Auth Header: ✅ Configured")
-        
         self._initialize_opentelemetry_common(otlp_config)
     
     def _get_metrics_client_config(self, service_name: str, environment: str) -> Dict[str, Any]:
@@ -160,10 +157,14 @@ def get_strands_tracer_config(self, service_name: str, environment: str) -> Dict
             endpoint = self._normalize_otlp_endpoint(f"{host}/api/public/otel", "/v1/traces")
             headers = {"Authorization": f"Basic {auth_header}"}
             
+            # Log endpoint securely
+            self._log_endpoint_securely("Langfuse tracer endpoint", endpoint)
+            
             return self._build_standard_tracer_config(service_name, environment, endpoint, headers)
             
         except Exception as e:
-            print(f"⚠️ Error getting Strands tracer config for Langfuse: {e}")
+            from secure_logging_utils import log_exception_safely
+            log_exception_safely(logger, "Langfuse tracer config generation", e)
             return {}
     
     def _cleanup_environment_variables(self):

From 0ed43fc83f9a4d1cea73bd36aa5c91dbd2b18c8d Mon Sep 17 00:00:00 2001
From: Anuj Sharma <anshrma@amazon.com>
Date: Thu, 13 Nov 2025 13:55:18 -0800
Subject: [PATCH 5/7] feat: enhance logging security for observability
 providers

- Replace print statements with secure logging in Dynatrace provider
- Add filtering for sensitive config values in base observability class
- Hash non-whitelisted string values to prevent data leakage
- Implement secure endpoint and error logging utilities
- Refactor trace attributes creation using DRY principles
---
 application_src/common/observability/base.py  |   9 +-
 .../common/observability/dynatrace.py         | 110 ++++++---------
 .../common/observability/elastic.py           | 133 +++++++-----------
 .../common/observability/langfuse.py          |   6 +-
 4 files changed, 103 insertions(+), 155 deletions(-)

diff --git a/application_src/common/observability/base.py b/application_src/common/observability/base.py
index e1abf21..7e7c668 100644
--- a/application_src/common/observability/base.py
+++ b/application_src/common/observability/base.py
@@ -87,8 +87,13 @@ def _log_credentials_securely(self, credentials_info: dict[str, Any]):
                 hashed_value = secure_logger.hash_sensitive_value(str(value)) if value else 'NOT SET'
                 logging.info(f"   {key}: HASH:{hashed_value}")
             else:
-                # Log non-sensitive config values normally
-                logging.info(f"   {key}: {value}")
+                # Only log boolean, integer, or explicitly safe string values
+                if isinstance(value, (bool, int)) or key.lower() in ['service', 'environment', 'version']:
+                    logging.info(f"   {key}: {value}")
+                else:
+                    # For any other string values, hash them to be safe
+                    safe_value = secure_logger.hash_sensitive_value(str(value)) if value else 'NOT SET'
+                    logging.info(f"   {key}: HASH:{safe_value}")
     
     def _log_endpoint_securely(self, endpoint_name: str, endpoint_value: str):
         """Log endpoint information securely."""
diff --git a/application_src/common/observability/dynatrace.py b/application_src/common/observability/dynatrace.py
index 96f8381..729f277 100644
--- a/application_src/common/observability/dynatrace.py
+++ b/application_src/common/observability/dynatrace.py
@@ -46,60 +46,51 @@ def initialize(self) -> Dict[str, Any]:
             os.environ["DT_TOKEN"] = dt_token
             os.environ["OTLP_ENDPOINT"] = otlp_endpoint
             
-            print(f"✅ Dynatrace environment variables set:")
-            print(f"   DT_TOKEN: {os.environ.get('DT_TOKEN', 'NOT SET')[:20]}...")
-            print(f"   OTLP_ENDPOINT: {os.environ.get('OTLP_ENDPOINT', 'NOT SET')}")
+            # Log environment variable setup securely
+            logging.info("✅ Dynatrace environment variables configured:")
+            logging.info("   DT_TOKEN: ✅ Set")
+            self._log_endpoint_securely("OTLP_ENDPOINT", otlp_endpoint)
             
             # CRITICAL: Initialize OpenTelemetry for Dynatrace
             try:
                 self._initialize_opentelemetry(dt_token, otlp_endpoint)
-                print("🚀 OpenTelemetry initialized successfully for Dynatrace")
+                logging.info("🚀 OpenTelemetry initialized successfully for Dynatrace")
             except Exception as otel_error:
-                print(f"⚠️ OpenTelemetry initialization failed: {otel_error}")
-                print("   Traces will not be sent to Dynatrace")
+                from secure_logging_utils import log_exception_safely
+                log_exception_safely(logger, "Dynatrace OpenTelemetry initialization", otel_error)
+                logging.warning("   Traces will not be sent to Dynatrace")
                 # Don't return empty dict - still provide trace attributes for debugging
             
-            # Generate a unique session ID
-            session_id = f"genai-session-{uuid.uuid4()}"
-            
-            # Set up trace attributes with configurable project name
-            project_name = os.environ.get('PROJECT_NAME', 'genai-box')
-            self.trace_attributes = {
-                "session.id": f"{project_name}-session-{uuid.uuid4()}",
-                "user.id": f"{project_name}-user",
-                "dynatrace.tags": [
-                    project_name,
-                    "Strands-Agent",
-                    "Production"
-                ]
-            }
+            # Use DRY helper to create standard trace attributes
+            self.trace_attributes = self._create_standard_trace_attributes()
             
-            print(f"✅ Dynatrace observability provider initialized successfully")
-            print(f"📊 Trace attributes: {self.trace_attributes}")
+            logging.info("✅ Dynatrace observability provider initialized successfully")
+            logging.debug("📊 Trace attributes configured")
             return self.trace_attributes
             
         except Exception as e:
-            print(f"❌ Error initializing Dynatrace observability provider: {str(e)}")
-            import traceback
-            traceback.print_exc()
+            from secure_logging_utils import log_exception_safely
+            log_exception_safely(logger, "Dynatrace provider initialization", e)
             return {}
     
     def _initialize_opentelemetry(self, dt_token: str, otlp_endpoint: str):
         """Initialize OpenTelemetry using common base class method."""
+        # Log endpoint securely (no clear text)
+        self._log_endpoint_securely("Dynatrace OTLP endpoint", otlp_endpoint)
+        logging.debug("🔑 Auth Header: ✅ Configured (not logged)")
+        
         # Use common initialization with Dynatrace-specific config
+        service_name, service_version = self._get_service_info()
         otlp_config = {
             "endpoint": otlp_endpoint,
             "headers": {"Authorization": f"Api-Token {dt_token}"},
             "resource_attributes": {
-                "service.name": "genai-in-a-box",
-                "service.version": "1.0.0",
+                "service.name": service_name,
+                "service.version": service_version,
                 "deployment.environment": "production"
             }
         }
         
-        print(f"📡 OTLP Endpoint: {otlp_endpoint}")
-        print(f"🔑 Auth Header: Api-Token {dt_token[:20]}...")
-        
         self._initialize_opentelemetry_common(otlp_config)
         
         # Initialize LLMetry after common setup
@@ -110,7 +101,7 @@ def _initialize_llmetry(self):
         try:
             from traceloop.sdk import Traceloop
             
-            print("🤖 Initializing LLMetry for LLM tracing...")
+            logging.info("🤖 Initializing LLMetry for LLM tracing...")
             
             # Initialize Traceloop (LLMetry) - it will use the existing OpenTelemetry setup
             Traceloop.init(
@@ -118,18 +109,17 @@ def _initialize_llmetry(self):
                 disable_batch=False,  # Use batching for better performance
             )
             
-            print("🚀 LLMetry initialized successfully - LLM calls will be traced")
-            print("📊 LLM-specific metrics: tokens, costs, latency, errors")
+            logging.info("🚀 LLMetry initialized successfully - LLM calls will be traced")
+            logging.info("📊 LLM-specific metrics: tokens, costs, latency, errors")
             
         except ImportError as import_error:
-            print(f"⚠️ LLMetry not available: {import_error}")
-            print("   Install with: pip install traceloop-sdk")
-            print("   Falling back to basic OpenTelemetry tracing")
+            logging.warning(f"⚠️ LLMetry not available: {str(import_error)}")
+            logging.info("   Install with: pip install traceloop-sdk")
+            logging.info("   Falling back to basic OpenTelemetry tracing")
         except Exception as llmetry_error:
-            print(f"⚠️ LLMetry initialization failed: {llmetry_error}")
-            print("   Falling back to basic OpenTelemetry tracing")
-            import traceback
-            traceback.print_exc()
+            from secure_logging_utils import log_exception_safely
+            log_exception_safely(logger, "Dynatrace LLMetry initialization", llmetry_error)
+            logging.info("   Falling back to basic OpenTelemetry tracing")
     
     def _get_metrics_client_config(self, service_name: str, environment: str) -> Dict[str, Any]:
         """Get Dynatrace metrics client configuration."""
@@ -187,44 +177,28 @@ def get_strands_tracer_config(self, service_name: str, environment: str) -> Dict
             dt_token = provider_config.get("dt_token", "")
             otlp_endpoint = provider_config.get("otlp_endpoint", "")
             
-            # Ensure endpoint is for traces
-            if not otlp_endpoint.endswith('/v1/traces'):
-                if otlp_endpoint.endswith('/'):
-                    otlp_endpoint = otlp_endpoint + 'v1/traces'
-                else:
-                    otlp_endpoint = otlp_endpoint + '/v1/traces'
-            
-            # Return Dynatrace OTLP configuration for Strands tracer
-            return {
-                "service_name": service_name,
-                "otlp_endpoint": otlp_endpoint,
-                "headers": {"Authorization": f"Api-Token {dt_token}"},
-                "enable_console_export": False,
-                "resource_attributes": {
-                    "service.name": service_name,
-                    "service.version": "1.0.0",
-                    "deployment.environment": environment,
-                    "dt.trace_sampled": "true"
-                }
-            }
+            # Use DRY helper to normalize endpoint and build config
+            normalized_endpoint = self._normalize_otlp_endpoint(otlp_endpoint, "/v1/traces")
+            headers = {"Authorization": f"Api-Token {dt_token}"}
+            additional_attributes = {"dt.trace_sampled": "true"}
+            
+            # Log endpoint securely
+            self._log_endpoint_securely("Dynatrace tracer endpoint", normalized_endpoint)
+            
+            return self._build_standard_tracer_config(service_name, environment, normalized_endpoint, headers, additional_attributes)
             
         except Exception as e:
-            print(f"⚠️ Error getting Strands tracer config for Dynatrace: {e}")
+            from secure_logging_utils import log_exception_safely
+            log_exception_safely(logger, "Dynatrace tracer config generation", e)
             return {}
     
     def _cleanup_environment_variables(self):
         """Clean up Dynatrace-specific environment variables."""
-        import os
         dynatrace_env_vars = [
             "DT_TOKEN", "OTLP_ENDPOINT"
         ]
         
-        removed_count = 0
-        for env_var in dynatrace_env_vars:
-            if env_var in os.environ:
-                del os.environ[env_var]
-                removed_count += 1
-        
+        removed_count = self._cleanup_environment_variables_by_list(dynatrace_env_vars)
         logging.debug(f"Removed {removed_count} Dynatrace environment variables")
     
     def _provider_specific_cleanup(self):
diff --git a/application_src/common/observability/elastic.py b/application_src/common/observability/elastic.py
index 78ca04c..4352208 100644
--- a/application_src/common/observability/elastic.py
+++ b/application_src/common/observability/elastic.py
@@ -29,16 +29,18 @@ def initialize(self) -> Dict[str, Any]:
             api_key = provider_config.get("api_key", "")
             otlp_endpoint = provider_config.get("otlp_endpoint", "")
             
-            logger.debug("Elastic credentials check:")
-            logger.debug(f"   API Key: {'Present' if api_key else 'Missing'}")
-            logger.debug(f"   OTLP Endpoint: {'Configured' if otlp_endpoint else 'Missing'}")
-            
-            if not api_key:
-                logger.error("Elastic API key (api_key) is required")
-                return {}
-                
-            if not otlp_endpoint:
-                logger.error("Elastic OTLP endpoint (otlp_endpoint) is required")
+            # Use secure logging for credentials validation
+            logging.info("🔑 Elastic configuration validation:")
+            self._log_credentials_securely({
+                "api_key": api_key,
+                "otlp_endpoint": otlp_endpoint
+            })
+            
+            # Validate required credentials
+            if not self._validate_required_credentials({
+                "api_key": api_key,
+                "otlp_endpoint": otlp_endpoint
+            }):
                 return {}
             
             # Set up environment variables for Elastic (CRITICAL for Strands integration)
@@ -46,53 +48,40 @@ def initialize(self) -> Dict[str, Any]:
             os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = otlp_endpoint
             os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"Authorization=ApiKey {api_key}"
             
-            logger.info("Elastic environment variables configured successfully")
+            # Log environment variable setup securely
+            logging.info("✅ Elastic environment variables configured:")
+            logging.info("   ELASTIC_API_KEY: ✅ Set")
+            self._log_endpoint_securely("OTEL_EXPORTER_OTLP_ENDPOINT", otlp_endpoint)
+            logging.info("   OTEL_EXPORTER_OTLP_HEADERS: ✅ Set")
             
             # CRITICAL: Initialize OpenTelemetry for Elastic
             try:
                 self._initialize_opentelemetry(otlp_endpoint, api_key)
-                logger.info("OpenTelemetry initialized successfully for Elastic")
+                logging.info("🚀 OpenTelemetry initialized successfully for Elastic")
             except Exception as otel_error:
-                logger.warning(f"OpenTelemetry initialization failed: {otel_error}")
-                logger.warning("Traces will not be sent to Elastic")
+                from secure_logging_utils import log_exception_safely
+                log_exception_safely(logger, "Elastic OpenTelemetry initialization", otel_error)
+                logging.warning("   Traces will not be sent to Elastic")
                 # Don't return empty dict - still provide trace attributes for debugging
             
-            # Get service name from config or environment
-            # Priority: agent_name from config > AGENT_NAME env var > SERVICE_NAME env var > default
-            service_name = (
-                self.config.get("agent_name") or 
-                os.environ.get('AGENT_NAME') or 
-                os.environ.get('SERVICE_NAME') or 
-                'genai-in-a-box'
-            )
-            
-            # Get service version from config or environment
-            service_version = (
-                self.config.get("agent_version") or 
-                os.environ.get('SERVICE_VERSION') or 
-                '1.0.0'
-            )
-            
-            # Get optional dataset routing configuration
+            # Use DRY helper to create standard trace attributes with Elastic-specific additions
             dataset = provider_config.get("dataset", "generic.otel")
             namespace = provider_config.get("namespace", "default")
             
-            self.trace_attributes = {
-                "session.id": f"{service_name}-session-{uuid.uuid4()}",
-                "user.id": f"{service_name}-user",
-                "service.name": service_name,
-                "service.version": service_version,
-                "deployment.environment": os.environ.get('ENVIRONMENT', 'production'),
+            self.trace_attributes = self._create_standard_trace_attributes()
+            # Add Elastic-specific attributes
+            self.trace_attributes.update({
                 "data_stream.dataset": dataset,
                 "data_stream.namespace": namespace
-            }
+            })
             
-            logger.info("Elastic observability provider initialized successfully")
-            logger.debug(f"Trace attributes: {self.trace_attributes}")
+            logging.info("✅ Elastic observability provider initialized successfully")
+            logging.debug("📊 Trace attributes configured")
             return self.trace_attributes
             
         except Exception as e:
-            logger.exception("Error initializing Elastic observability provider")
+            from secure_logging_utils import log_exception_safely
+            log_exception_safely(logger, "Elastic provider initialization", e)
             return {}
     
     def _initialize_opentelemetry(self, otlp_endpoint: str, api_key: str):
@@ -102,20 +91,17 @@ def _initialize_opentelemetry(self, otlp_endpoint: str, api_key: str):
         dataset = provider_config.get("dataset", "generic.otel")
         namespace = provider_config.get("namespace", "default")
         
-        # Get service name and version
-        service_name = self.config.get("agent_name", "genai-in-a-box")
-        service_version = self.config.get("agent_version", "1.0.0")
+        # Use DRY helper for service info
+        service_name, service_version = self._get_service_info()
         
-        # Ensure correct endpoint path
-        if not otlp_endpoint.endswith('/v1/traces'):
-            if otlp_endpoint.endswith('/'):
-                otlp_endpoint = otlp_endpoint + 'v1/traces'
-            else:
-                otlp_endpoint = otlp_endpoint + '/v1/traces'
+        # Normalize endpoint and log securely
+        normalized_endpoint = self._normalize_otlp_endpoint(otlp_endpoint, '/v1/traces')
+        self._log_endpoint_securely("Elastic OTLP endpoint", normalized_endpoint)
+        logging.debug("🔑 Auth Header: ✅ Configured (not logged)")
         
         # Use common initialization with Elastic-specific config
         otlp_config = {
-            "endpoint": otlp_endpoint,
+            "endpoint": normalized_endpoint,
             "headers": {"Authorization": f"ApiKey {api_key}"},
             "resource_attributes": {
                 "service.name": service_name,
@@ -126,9 +112,6 @@ def _initialize_opentelemetry(self, otlp_endpoint: str, api_key: str):
             }
         }
         
-        logger.debug(f"   Final OTLP traces endpoint: {otlp_endpoint}")
-        logger.debug("   Headers: Authorization=ApiKey [REDACTED]")
-        
         self._initialize_opentelemetry_common(otlp_config)
     
     def _get_metrics_client_config(self, service_name: str, environment: str) -> Dict[str, Any]:
@@ -186,45 +169,31 @@ def get_strands_tracer_config(self, service_name: str, environment: str) -> Dict
             api_key = provider_config.get("api_key", "")
             otlp_endpoint = provider_config.get("otlp_endpoint", "")
             
-            # Ensure endpoint is for traces
-            if not otlp_endpoint.endswith('/v1/traces'):
-                if otlp_endpoint.endswith('/'):
-                    otlp_endpoint = otlp_endpoint + 'v1/traces'
-                else:
-                    otlp_endpoint = otlp_endpoint + '/v1/traces'
-            
-            # Return Elastic OTLP configuration for Strands tracer
-            return {
-                "service_name": service_name,
-                "otlp_endpoint": otlp_endpoint,
-                "headers": {"Authorization": f"ApiKey {api_key}"},
-                "enable_console_export": False,
-                "resource_attributes": {
-                    "service.name": service_name,
-                    "service.version": "1.0.0",
-                    "deployment.environment": environment,
-                    "data_stream.dataset": provider_config.get("dataset", "generic.otel"),
-                    "data_stream.namespace": provider_config.get("namespace", "default")
-                }
+            # Use DRY helper to normalize endpoint and build config
+            normalized_endpoint = self._normalize_otlp_endpoint(otlp_endpoint, "/v1/traces")
+            headers = {"Authorization": f"ApiKey {api_key}"}
+            additional_attributes = {
+                "data_stream.dataset": provider_config.get("dataset", "generic.otel"),
+                "data_stream.namespace": provider_config.get("namespace", "default")
             }
             
+            # Log endpoint securely
+            self._log_endpoint_securely("Elastic tracer endpoint", normalized_endpoint)
+            
+            return self._build_standard_tracer_config(service_name, environment, normalized_endpoint, headers, additional_attributes)
+            
         except Exception as e:
-            print(f"⚠️ Error getting Strands tracer config for Elastic: {e}")
+            from secure_logging_utils import log_exception_safely
+            log_exception_safely(logger, "Elastic tracer config generation", e)
             return {}
     
     def _cleanup_environment_variables(self):
         """Clean up Elastic-specific environment variables."""
-        import os
         elastic_env_vars = [
             "ELASTIC_API_KEY", "OTEL_EXPORTER_OTLP_ENDPOINT", "OTEL_EXPORTER_OTLP_HEADERS"
         ]
         
-        removed_count = 0
-        for env_var in elastic_env_vars:
-            if env_var in os.environ:
-                del os.environ[env_var]
-                removed_count += 1
-        
+        removed_count = self._cleanup_environment_variables_by_list(elastic_env_vars)
         logging.debug(f"Removed {removed_count} Elastic environment variables")
     
     def _provider_specific_cleanup(self):
diff --git a/application_src/common/observability/langfuse.py b/application_src/common/observability/langfuse.py
index 4924dd0..d39b0a4 100644
--- a/application_src/common/observability/langfuse.py
+++ b/application_src/common/observability/langfuse.py
@@ -80,14 +80,14 @@ def initialize(self) -> Dict[str, Any]:
     
     def _initialize_opentelemetry(self, public_key: str, secret_key: str, host: str):
         """Initialize OpenTelemetry using common base class method."""
-        # Build auth header
+        # Build auth header (never logged)
         auth_string = f"{public_key}:{secret_key}"
         auth_header = base64.b64encode(auth_string.encode()).decode()
         
-        # Build endpoint and log securely
+        # Build endpoint and log securely (no clear text)
         endpoint = f"{host}/api/public/otel/v1/traces"
         self._log_endpoint_securely("Langfuse OTLP endpoint", endpoint)
-        logging.debug("🔑 Auth Header: ✅ Configured")
+        logging.debug("🔑 Auth Header: ✅ Configured (not logged)")
         
         # Use common initialization with Langfuse-specific config
         service_name, service_version = self._get_service_info()

From a1fec5b6c1290d7fcbbe217dba783c01bd4e20b7 Mon Sep 17 00:00:00 2001
From: Anuj Sharma <anshrma@amazon.com>
Date: Thu, 13 Nov 2025 14:46:32 -0800
Subject: [PATCH 6/7] refactor(observability): simplify credential validation
 to reduce security risks

- Replace verbose secure logging methods with minimal validation approach
- Remove _log_credentials_securely, _log_endpoint_securely, and
  _validate_required_credentials methods
- Add _validate_credentials_safely method with reduced logging surface
- Update Datadog provider to use simplified credential validation
- Reduce potential for sensitive information leakage in logs
---
 application_src/common/observability/base.py  | 45 +++----------------
 .../common/observability/datadog.py           | 19 +++-----
 .../common/observability/dynatrace.py         | 17 +++----
 .../common/observability/elastic.py           | 17 +++----
 .../common/observability/langfuse.py          | 25 +++--------
 .../common/secure_logging_utils.py            | 11 +++--
 6 files changed, 36 insertions(+), 98 deletions(-)

diff --git a/application_src/common/observability/base.py b/application_src/common/observability/base.py
index 7e7c668..66b7d70 100644
--- a/application_src/common/observability/base.py
+++ b/application_src/common/observability/base.py
@@ -74,53 +74,20 @@ def _cleanup_environment_variables_by_list(self, env_var_list: list[str]) -> int
                 removed_count += 1
         return removed_count
     
-    def _log_credentials_securely(self, credentials_info: dict[str, Any]):
-        """Log credential validation status securely without exposing sensitive values."""
-        secure_logger = SecureLogger()
-        
-        for key, value in credentials_info.items():
-            if any(sensitive in key.lower() for sensitive in ['key', 'token', 'secret', 'password']):
-                status = '✅ Present' if value else '❌ Missing'
-                logging.info(f"   {key}: {status}")
-            elif 'endpoint' in key.lower() or 'host' in key.lower() or 'url' in key.lower():
-                # Hash endpoints/URLs as they may contain sensitive path info
-                hashed_value = secure_logger.hash_sensitive_value(str(value)) if value else 'NOT SET'
-                logging.info(f"   {key}: HASH:{hashed_value}")
-            else:
-                # Only log boolean, integer, or explicitly safe string values
-                if isinstance(value, (bool, int)) or key.lower() in ['service', 'environment', 'version']:
-                    logging.info(f"   {key}: {value}")
-                else:
-                    # For any other string values, hash them to be safe
-                    safe_value = secure_logger.hash_sensitive_value(str(value)) if value else 'NOT SET'
-                    logging.info(f"   {key}: HASH:{safe_value}")
-    
-    def _log_endpoint_securely(self, endpoint_name: str, endpoint_value: str):
-        """Log endpoint information securely."""
-        if endpoint_value:
-            secure_logger = SecureLogger()
-            hashed_endpoint = secure_logger.hash_sensitive_value(endpoint_value)
-            logging.debug(f"{endpoint_name}: HASH:{hashed_endpoint}")
-        else:
-            logging.warning(f"{endpoint_name}: NOT SET")
-    
-    def _validate_required_credentials(self, required_creds: dict[str, Any]) -> bool:
-        """Validate required credentials are present and log securely."""
-        secure_logger = SecureLogger()
+    def _validate_credentials_safely(self, credentials: list[tuple[str, Any]]) -> bool:
+        """Validate credentials are present with minimal logging to avoid security risks."""
         missing_creds = []
         
-        for cred_name, cred_value in required_creds.items():
+        # Simple validation - no verbose logging around sensitive operations
+        for cred_name, cred_value in credentials:
             if not cred_value:
                 missing_creds.append(cred_name)
-            else:
-                # Log that credential is present without exposing value
-                logging.debug(f"{cred_name}: ✅ Present")
         
         if missing_creds:
-            logging.error(f"❌ Missing required {self.provider_name} credentials: {', '.join(missing_creds)}")
+            logging.error(f"Missing required {self.provider_name} credentials: {', '.join(missing_creds)}")
             return False
         
-        logging.info(f"✅ All required {self.provider_name} credentials validated")
+        logging.info(f"✅ {self.provider_name} credentials validated")
         return True
     
     @abstractmethod
diff --git a/application_src/common/observability/datadog.py b/application_src/common/observability/datadog.py
index fc154fd..691b92b 100644
--- a/application_src/common/observability/datadog.py
+++ b/application_src/common/observability/datadog.py
@@ -38,22 +38,13 @@ def initialize(self) -> Dict[str, Any]:
             enable_llm_obs = provider_config.get("enable_llm_obs", True)
             enable_logs = provider_config.get("enable_logs", True)
             
-            # Use secure logging for credentials validation
-            logging.info("🔑 Datadog configuration validation:")
-            self._log_credentials_securely({
-                "api_key": api_key,
-                "site": site,
-                "environment": environment,
-                "service": service_name,
-                "version": version,
-                "llm_observability": enable_llm_obs,
-                "logs": enable_logs
-            })
-            
-            # Validate required credentials
-            if not self._validate_required_credentials({"api_key": api_key}):
+            # Simple credential validation with minimal logging
+            if not api_key:
+                logging.error("Datadog API key required but not provided")
                 return {}
             
+            logging.info("Datadog credentials validated")
+            
             # Set up environment variables for ddtrace
             os.environ["DD_API_KEY"] = api_key
             os.environ["DD_SITE"] = site
diff --git a/application_src/common/observability/dynatrace.py b/application_src/common/observability/dynatrace.py
index 729f277..c0f6d68 100644
--- a/application_src/common/observability/dynatrace.py
+++ b/application_src/common/observability/dynatrace.py
@@ -28,20 +28,13 @@ def initialize(self) -> Dict[str, Any]:
             dt_token = provider_config.get("dt_token", "")
             otlp_endpoint = provider_config.get("otlp_endpoint", "")
             
-            # Use secure logging for credentials validation
-            logging.info("🔑 Dynatrace configuration validation:")
-            self._log_credentials_securely({
-                "dt_token": dt_token,
-                "otlp_endpoint": otlp_endpoint
-            })
-            
-            # Validate required credentials
-            if not self._validate_required_credentials({
-                "dt_token": dt_token,
-                "otlp_endpoint": otlp_endpoint
-            }):
+            # Simple credential validation with minimal logging
+            if not dt_token or not otlp_endpoint:
+                logging.error("Dynatrace dt_token and otlp_endpoint required but not provided")
                 return {}
             
+            logging.info("Dynatrace credentials validated")
+            
             # Set up environment variables for Dynatrace (CRITICAL for Strands integration)
             os.environ["DT_TOKEN"] = dt_token
             os.environ["OTLP_ENDPOINT"] = otlp_endpoint
diff --git a/application_src/common/observability/elastic.py b/application_src/common/observability/elastic.py
index 4352208..ef4c609 100644
--- a/application_src/common/observability/elastic.py
+++ b/application_src/common/observability/elastic.py
@@ -29,20 +29,13 @@ def initialize(self) -> Dict[str, Any]:
             api_key = provider_config.get("api_key", "")
             otlp_endpoint = provider_config.get("otlp_endpoint", "")
             
-            # Use secure logging for credentials validation
-            logging.info("🔑 Elastic configuration validation:")
-            self._log_credentials_securely({
-                "api_key": api_key,
-                "otlp_endpoint": otlp_endpoint
-            })
-            
-            # Validate required credentials
-            if not self._validate_required_credentials({
-                "api_key": api_key,
-                "otlp_endpoint": otlp_endpoint
-            }):
+            # Simple credential validation with minimal logging
+            if not api_key or not otlp_endpoint:
+                logging.error("Elastic api_key and otlp_endpoint required but not provided")
                 return {}
             
+            logging.info("Elastic credentials validated")
+            
             # Set up environment variables for Elastic (CRITICAL for Strands integration)
             os.environ["ELASTIC_API_KEY"] = api_key
             os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = otlp_endpoint
diff --git a/application_src/common/observability/langfuse.py b/application_src/common/observability/langfuse.py
index d39b0a4..f0965c7 100644
--- a/application_src/common/observability/langfuse.py
+++ b/application_src/common/observability/langfuse.py
@@ -30,31 +30,20 @@ def initialize(self) -> Dict[str, Any]:
             secret_key = provider_config.get("secret_key", "")
             host = provider_config.get("host", "https://us.cloud.langfuse.com")
             
-            # Use secure logging for credentials validation
-            logging.info("🔑 Langfuse configuration validation:")
-            self._log_credentials_securely({
-                "public_key": public_key,
-                "secret_key": secret_key,
-                "host": host
-            })
-            
-            # Validate required credentials
-            if not self._validate_required_credentials({
-                "public_key": public_key,
-                "secret_key": secret_key
-            }):
+            # Simple credential validation with minimal logging
+            if not public_key or not secret_key:
+                logging.error("Langfuse public_key and secret_key required but not provided")
                 return {}
             
+            logging.info("Langfuse credentials validated")
+            
             # Set up environment variables for Langfuse (CRITICAL for Strands integration)
             os.environ["LANGFUSE_PUBLIC_KEY"] = public_key
             os.environ["LANGFUSE_SECRET_KEY"] = secret_key
             os.environ["LANGFUSE_HOST"] = host
             
-            # Log environment variable setup securely
-            logging.info("✅ Langfuse environment variables configured:")
-            logging.info("   LANGFUSE_PUBLIC_KEY: ✅ Set")
-            logging.info("   LANGFUSE_SECRET_KEY: ✅ Set") 
-            self._log_endpoint_securely("LANGFUSE_HOST", host)
+            # Minimal logging - avoid verbose logging around sensitive operations
+            logging.info("Langfuse environment variables configured")
             
             # CRITICAL: Initialize OpenTelemetry for Langfuse
             try:
diff --git a/application_src/common/secure_logging_utils.py b/application_src/common/secure_logging_utils.py
index 16e1fc4..b39cdd4 100644
--- a/application_src/common/secure_logging_utils.py
+++ b/application_src/common/secure_logging_utils.py
@@ -80,13 +80,18 @@ def log_exception_securely(self, logger_instance: logging.Logger, context: str,
     
     @staticmethod
     def hash_sensitive_value(value: str) -> str:
-        """Create a hash of sensitive value for logging."""
+        """Return a short SHA-256 fingerprint of a sensitive value for use in logs.
+
+        The 16-hex-character digest is unsalted log obfuscation, not credential
+        storage; for password hashing use bcrypt, scrypt, or Argon2.
+        """
         if not value:
             return ""
         
-        # Use SHA-256 for consistent hashing
+        # Use SHA-256 for logging obfuscation only (NOT password hashing)
+        # This is appropriate for configuration/endpoint obfuscation in logs
         hash_obj = hashlib.sha256(value.encode('utf-8'))
-        return hash_obj.hexdigest()[:16]  # First 16 characters
+        return hash_obj.hexdigest()[:16]  # First 16 characters for logging
     
     @staticmethod
     def create_safe_context_info(context_data: Dict[str, Any]) -> Dict[str, str]:

From 2ba07ade59deb04fa1af109643a17b911d55e9bd Mon Sep 17 00:00:00 2001
From: Anuj Sharma <anshrma@amazon.com>
Date: Fri, 14 Nov 2025 13:15:45 -0800
Subject: [PATCH 7/7] refactor: update Datadog integration to follow official
 Strands SDK guidance

- Add OpenTelemetry environment variables for official Datadog integration
- Switch from OTLP intake to official trace agent endpoint
- Update headers to use dd-api-key and dd-otlp-source format
- Include new OTEL variables in cleanup process
- Follow documented best practices from Datadog LLM observability docs
---
 .../common/observability/datadog.py           | 35 ++++++++++++++-----
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/application_src/common/observability/datadog.py b/application_src/common/observability/datadog.py
index 691b92b..b803dfd 100644
--- a/application_src/common/observability/datadog.py
+++ b/application_src/common/observability/datadog.py
@@ -45,14 +45,21 @@ def initialize(self) -> Dict[str, Any]:
             
             logging.info("Datadog credentials validated")
             
-            # Set up environment variables for ddtrace
+            # Set up environment variables for official Datadog Strands SDK integration
             os.environ["DD_API_KEY"] = api_key
             os.environ["DD_SITE"] = site
             os.environ["DD_ENV"] = environment
             os.environ["DD_SERVICE"] = service_name
             os.environ["DD_VERSION"] = version
             
-            # Force agentless mode for ECS deployment - Official Datadog approach
+            # Official OpenTelemetry configuration for Strands SDK integration
+            # Following https://docs.datadoghq.com/llm_observability/instrumentation/otel_instrumentation/#using-strands-agents
+            os.environ["OTEL_EXPORTER_OTLP_TRACES_PROTOCOL"] = "http/protobuf"
+            os.environ["OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"] = f"https://trace.agent.{site}/v1/traces"
+            os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = f"dd-api-key={api_key},dd-otlp-source=datadog"
+            os.environ["OTEL_SEMCONV_STABILITY_OPT_IN"] = "gen_ai_latest_experimental"
+            
+            # Force agentless mode for ECS deployment
             os.environ["DD_TRACE_AGENT_URL"] = f"https://trace.agent.{site}"
             os.environ["DD_TRACE_API_VERSION"] = "v0.4"
             os.environ["DD_AGENT_HOST"] = ""
@@ -259,6 +266,7 @@ def _emit_log_with_client(self, log_data: Dict[str, Any], client_config: Dict[st
     def _cleanup_environment_variables(self):
         """Clean up Datadog-specific environment variables."""
         datadog_env_vars = [
+            # Datadog DD_* variables
             "DD_API_KEY", "DD_SITE", "DD_ENV", "DD_SERVICE", "DD_VERSION",
             "DD_TRACE_AGENT_URL", "DD_TRACE_API_VERSION", "DD_AGENT_HOST",
             "DD_DOGSTATSD_PORT", "DD_APM_DD_URL", "DD_LLMOBS_INTAKE_URL",
@@ -267,7 +275,11 @@ def _cleanup_environment_variables(self):
             "DD_TRACE_TLS_CA_CERT", "DD_TRACE_TLS_VERIFY", "DD_LLMOBS_TLS_VERIFY",
             "DD_TRACE_WRITER_BUFFER_SIZE_BYTES", "DD_TRACE_WRITER_MAX_PAYLOAD_SIZE",
             "DD_TRACE_WRITER_INTERVAL_SECONDS", "DATADOG_METRICS_ENABLED",
-            "DATADOG_SERVICE_NAME", "DATADOG_ENVIRONMENT"
+            "DATADOG_SERVICE_NAME", "DATADOG_ENVIRONMENT",
+            
+            # OpenTelemetry variables used for Datadog Strands SDK integration
+            "OTEL_EXPORTER_OTLP_TRACES_PROTOCOL", "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT",
+            "OTEL_EXPORTER_OTLP_TRACES_HEADERS", "OTEL_SEMCONV_STABILITY_OPT_IN"
         ]
         
         removed_count = self._cleanup_environment_variables_by_list(datadog_env_vars)
@@ -308,7 +320,7 @@ def _provider_specific_cleanup(self):
             logging.warning(f"Error in Datadog-specific cleanup: {e}")
     
     def get_strands_tracer_config(self, service_name: str, environment: str) -> Dict[str, Any]:
-        """Get configuration for Strands get_tracer() to send traces to Datadog."""
+        """Get configuration for Strands get_tracer() following official Datadog guidance."""
         # CRITICAL: Only return config if this provider is currently active
         if not self._validate_provider_is_active():
             logging.debug(f"Skipping tracer config for inactive {self.provider_name} provider")
@@ -319,12 +331,17 @@ def get_strands_tracer_config(self, service_name: str, environment: str) -> Dict
             site = provider_config.get("site", "datadoghq.com")
             api_key = provider_config.get("api_key", "")
             
-            # Use DRY helper to build standard tracer config with secure logging
-            endpoint = f"https://otlp-intake.{site}/v1/traces"
-            headers = {"DD-API-KEY": api_key}
+            # Follow official Datadog guidance for Strands SDK integration
+            # https://docs.datadoghq.com/llm_observability/instrumentation/otel_instrumentation/#using-strands-agents
+            
+            # Use official Datadog trace agent endpoint (not OTLP intake)
+            endpoint = f"https://trace.agent.{site}/v1/traces"
+            headers = {
+                "dd-api-key": api_key,
+                "dd-otlp-source": "datadog"
+            }
             
-            # Log endpoint securely
-            self._log_endpoint_securely("Datadog OTLP endpoint", endpoint)
+            logging.debug("Using official Datadog trace agent endpoint for Strands integration")
             
             return self._build_standard_tracer_config(service_name, environment, endpoint, headers)