diff --git a/server/links/diet/README.md b/server/links/diet/README.md index dc73c39..b37e4ad 100644 --- a/server/links/diet/README.md +++ b/server/links/diet/README.md @@ -5,7 +5,8 @@ The Diet link is a specialized plugin that helps reduce the size and content of ## Features - Selective removal of dialog body content -- Optional media redirection to external storage +- Optional media redirection to external storage (HTTP endpoint or S3) +- S3 storage with presigned URL generation for secure access - Removal of analysis data - Filtering of attachments by MIME type - Removal of system prompts to prevent LLM instruction injection @@ -20,6 +21,13 @@ default_options = { "remove_analysis": False, # Remove all analysis data "remove_attachment_types": [], # List of attachment types to remove (e.g., ["image/jpeg", "audio/mp3"]) "remove_system_prompts": False, # Remove system_prompt keys to prevent LLM instruction insertion + # S3 storage options for dialog bodies + "s3_bucket": "", # S3 bucket name for storing dialog bodies + "s3_path": "", # Optional path prefix within the bucket + "aws_access_key_id": "", # AWS access key ID + "aws_secret_access_key": "", # AWS secret access key + "aws_region": "us-east-1", # AWS region (default: us-east-1) + "presigned_url_expiration": None, # Presigned URL expiration in seconds (None = default 1 hour) } ``` @@ -31,6 +39,15 @@ default_options = { - `remove_attachment_types`: List of MIME types to remove from attachments - `remove_system_prompts`: Whether to remove system_prompt keys to prevent LLM instruction injection +### S3 Storage Options + +- `s3_bucket`: The S3 bucket name where dialog bodies will be stored +- `s3_path`: Optional path prefix within the bucket (e.g., "dialogs/processed") +- `aws_access_key_id`: AWS access key ID for authentication +- `aws_secret_access_key`: AWS secret access key for authentication +- `aws_region`: AWS region where the bucket is located (default: "us-east-1") +- `presigned_url_expiration`: Expiration time in seconds for presigned URLs (optional, defaults to 3600 seconds / 1 hour) + ## Usage The link processes vCons by: @@ -42,9 +59,39 @@ The link processes vCons by: - Removing system prompts if specified 3. Storing the modified vCon back in Redis -## Media Redirection +## Media Storage Options + +The diet link supports two methods for storing dialog bodies externally: + +### S3 Storage (Recommended) + +When `s3_bucket` is configured, the link will: +1. Upload dialog body content to the specified S3 bucket +2. Generate a presigned URL for secure access +3. Replace the body content with the presigned URL +4. Set the body_type to "url" +5. If the upload fails, the body content will be removed + +**S3 takes precedence over HTTP endpoint** - if both `s3_bucket` and `post_media_to_url` are configured, S3 will be used. + +Example S3 configuration: +```python +{ + "remove_dialog_body": True, + "s3_bucket": "my-vcon-storage", + "s3_path": "dialogs/archived", + "aws_access_key_id": "AKIAXXXXXXXX", + "aws_secret_access_key": "xxxxxxxxxxxxx", + "aws_region": "us-west-2", + "presigned_url_expiration": 86400, # 24 hours +} +``` + +The S3 key structure is: `{s3_path}/{vcon_uuid}/{dialog_id}_{unique_id}.txt` + +### HTTP Endpoint Storage -When `post_media_to_url` is configured, the link will: +When `post_media_to_url` is configured (and `s3_bucket` is not), the link will: 1. Post the media content to the specified URL 2. Replace the body content with the URL to the stored content 3. Set the body_type to "url" @@ -59,7 +106,8 @@ When `post_media_to_url` is configured, the link will: ## Dependencies - Redis for vCon storage -- Requests library for media redirection +- Requests library for HTTP media redirection +- boto3 library for S3 storage - Custom utilities: - logging_utils @@ -67,4 +115,7 @@ When `post_media_to_url` is configured, the link will: - Redis connection must be configured - Appropriate permissions for vCon access and storage -- If using media redirection, a valid endpoint URL must be provided \ No newline at end of file +- If using HTTP media redirection, a valid endpoint URL must be provided +- If using S3 storage: + - Valid AWS credentials with write access to the specified bucket + - The bucket must exist and be accessible \ No newline at end of file diff --git a/server/links/diet/__init__.py b/server/links/diet/__init__.py index a064ca8..4b1ea8f 100644 --- a/server/links/diet/__init__.py +++ b/server/links/diet/__init__.py @@ -2,11 +2,34 @@ from lib.logging_utils import init_logger import json import requests +import uuid +import boto3 +from botocore.exceptions import ClientError from typing import Dict, List, Any, Optional logger = init_logger(__name__) logger.info("MDO THIS SHOULD PRINT") +_REDACTED = "[REDACTED]" + + +def _redact_option_value(key: str, value: Any) -> Any: + """ + Redact sensitive option values before logging. + + This prevents leaking secrets (for example AWS credentials) into logs. + """ + key_l = (key or "").lower() + if ( + key_l == "aws_secret_access_key" + or "secret" in key_l + or "password" in key_l + or "token" in key_l + or key_l.endswith("_secret") + ): + return _REDACTED + return value + # Default options that control which elements to remove default_options = { @@ -15,8 +38,93 @@ "remove_analysis": False, # Remove all analysis data "remove_attachment_types": [], # List of attachment types to remove (e.g., ["image/jpeg", "audio/mp3"]) "remove_system_prompts": False, # Remove system_prompt keys to prevent LLM instruction insertion + # S3 storage options for dialog bodies + "s3_bucket": "", # S3 bucket name for storing dialog bodies + "s3_path": "", # Optional path prefix within the bucket + "aws_access_key_id": "", # AWS access key ID + "aws_secret_access_key": "", # AWS secret access key + "aws_region": "us-east-1", # AWS region (default: us-east-1) + "presigned_url_expiration": None, # Presigned URL expiration in seconds (None = no expiration/default 1 hour) } + +def _get_s3_client(options: Dict[str, Any]): + """Create and return an S3 client with the provided credentials.""" + return boto3.client( + "s3", + aws_access_key_id=options["aws_access_key_id"], + aws_secret_access_key=options["aws_secret_access_key"], + region_name=options.get("aws_region", "us-east-1"), + ) + + +def _upload_to_s3_and_get_presigned_url( + content: str, + vcon_uuid: str, + dialog_id: str, + options: Dict[str, Any] +) -> Optional[str]: + """ + Upload dialog body content to S3 and return a presigned URL. + + Args: + content: The dialog body content to upload + vcon_uuid: The vCon UUID + dialog_id: The dialog ID + options: Configuration options including S3 credentials and bucket info + + Returns: + Presigned URL to access the uploaded content, or None if upload fails + """ + try: + s3 = _get_s3_client(options) + + # Generate a unique key for this dialog body + unique_id = str(uuid.uuid4()) + key = f"{dialog_id}_{unique_id}.txt" if dialog_id else f"{unique_id}.txt" + + # Add vcon_uuid as a directory level + key = f"{vcon_uuid}/{key}" + + # Add optional path prefix + if options.get("s3_path"): + key = f"{options['s3_path']}/{key}" + + bucket = options["s3_bucket"] + + # Upload the content + s3.put_object( + Bucket=bucket, + Key=key, + Body=content.encode("utf-8") if isinstance(content, str) else content, + ContentType="text/plain", + ) + + logger.info(f"Successfully uploaded dialog body to s3://{bucket}/{key}") + + # Generate presigned URL + expiration = options.get("presigned_url_expiration") + if expiration is None: + # Default to 1 hour (3600 seconds) if not specified + expiration = 3600 + + presigned_url = s3.generate_presigned_url( + "get_object", + Params={"Bucket": bucket, "Key": key}, + ExpiresIn=expiration, + ) + + logger.info(f"Generated presigned URL with expiration {expiration}s") + return presigned_url + + except ClientError as e: + logger.error(f"S3 client error uploading dialog body: {e}") + return None + except Exception as e: + logger.error(f"Exception uploading dialog body to S3: {e}") + return None + + def run(vcon_uuid, link_name, opts=default_options): logger.info("Starting diet::run") @@ -24,7 +132,7 @@ def run(vcon_uuid, link_name, opts=default_options): options = {**default_options, **opts} for key, value in options.items(): - logger.info(f"diet::{key}: {value}") + logger.info("diet::%s: %s", key, _redact_option_value(key, value)) # Load vCon from Redis using JSON.GET vcon = redis.json().get(f"vcon:{vcon_uuid}") @@ -41,12 +149,27 @@ def run(vcon_uuid, link_name, opts=default_options): logger.info("diet::got dialog") if options["remove_dialog_body"] and "body" in dialog: logger.info("diet::remove_dialog_body AND body") - if options["post_media_to_url"] and dialog.get("body"): + dialog_body = dialog.get("body") + dialog_id = dialog.get("id", "") + + # Check if S3 storage is configured + if options.get("s3_bucket") and dialog_body: + logger.info("diet::uploading to S3") + presigned_url = _upload_to_s3_and_get_presigned_url( + dialog_body, vcon_uuid, dialog_id, options + ) + if presigned_url: + dialog["body"] = presigned_url + dialog["body_type"] = "url" + else: + logger.error("Failed to upload to S3, removing body") + dialog["body"] = "" + elif options["post_media_to_url"] and dialog_body: try: # Post the body content to the specified URL response = requests.post( options["post_media_to_url"], - json={"content": dialog["body"], "vcon_uuid": vcon_uuid, "dialog_id": dialog.get("id", "")} + json={"content": dialog_body, "vcon_uuid": vcon_uuid, "dialog_id": dialog_id} ) if response.status_code == 200: # Replace body with the URL to the stored content diff --git a/server/links/diet/test_diet.py b/server/links/diet/test_diet.py index 61eeeaf..1a18e90 100644 --- a/server/links/diet/test_diet.py +++ b/server/links/diet/test_diet.py @@ -1,8 +1,10 @@ import json import pytest +import logging from unittest.mock import patch, MagicMock +from botocore.exceptions import ClientError -from server.links.diet import run, default_options, remove_system_prompts_recursive +from server.links.diet import run, default_options, remove_system_prompts_recursive, _upload_to_s3_and_get_presigned_url @pytest.fixture def sample_vcon(): @@ -269,4 +271,232 @@ def test_combined_options(mock_redis, sample_vcon): assert "analysis" not in saved_vcon assert len(saved_vcon["attachments"]) == 2 assert saved_vcon["attachments"][0]["id"] == "att2" - assert "system_prompt" not in saved_vcon["attachments"][1]["metadata"] \ No newline at end of file + assert "system_prompt" not in saved_vcon["attachments"][1]["metadata"] + + +@pytest.fixture +def s3_options(): + return { + "remove_dialog_body": True, + "s3_bucket": "test-bucket", + "s3_path": "dialogs", + "aws_access_key_id": "test-key-id", + "aws_secret_access_key": "test-secret-key", + "aws_region": "us-west-2", + "presigned_url_expiration": 7200, + } + + +@patch('server.links.diet.boto3') +def test_upload_to_s3_and_get_presigned_url(mock_boto3): + # Test the S3 upload helper function + mock_s3 = MagicMock() + mock_boto3.client.return_value = mock_s3 + mock_s3.generate_presigned_url.return_value = "https://test-bucket.s3.amazonaws.com/presigned-url" + + options = { + "s3_bucket": "test-bucket", + "s3_path": "dialogs", + "aws_access_key_id": "test-key-id", + "aws_secret_access_key": "test-secret-key", + "aws_region": "us-west-2", + "presigned_url_expiration": 7200, + } + + result = _upload_to_s3_and_get_presigned_url( + "test content", + "vcon-uuid-123", + "dialog1", + options + ) + + # Verify S3 client was created with correct credentials + mock_boto3.client.assert_called_once_with( + "s3", + aws_access_key_id="test-key-id", + aws_secret_access_key="test-secret-key", + region_name="us-west-2", + ) + + # Verify put_object was called + mock_s3.put_object.assert_called_once() + call_kwargs = mock_s3.put_object.call_args[1] + assert call_kwargs["Bucket"] == "test-bucket" + assert "dialogs/vcon-uuid-123/dialog1_" in call_kwargs["Key"] + assert call_kwargs["ContentType"] == "text/plain" + + # Verify presigned URL was generated with correct expiration + mock_s3.generate_presigned_url.assert_called_once() + presign_call = mock_s3.generate_presigned_url.call_args + assert presign_call[0][0] == "get_object" + assert presign_call[1]["ExpiresIn"] == 7200 + + assert result == "https://test-bucket.s3.amazonaws.com/presigned-url" + + +@patch('server.links.diet.boto3') +def test_upload_to_s3_default_expiration(mock_boto3): + # Test that default expiration (3600) is used when not specified + mock_s3 = MagicMock() + mock_boto3.client.return_value = mock_s3 + mock_s3.generate_presigned_url.return_value = "https://presigned-url" + + options = { + "s3_bucket": "test-bucket", + "aws_access_key_id": "test-key-id", + "aws_secret_access_key": "test-secret-key", + "presigned_url_expiration": None, # Not specified + } + + _upload_to_s3_and_get_presigned_url("content", "vcon-123", "dialog1", options) + + # Verify default expiration of 3600 seconds was used + presign_call = mock_s3.generate_presigned_url.call_args + assert presign_call[1]["ExpiresIn"] == 3600 + + +@patch('server.links.diet.boto3') +def test_upload_to_s3_no_path_prefix(mock_boto3): + # Test S3 upload without path prefix + mock_s3 = MagicMock() + mock_boto3.client.return_value = mock_s3 + mock_s3.generate_presigned_url.return_value = "https://presigned-url" + + options = { + "s3_bucket": "test-bucket", + "s3_path": "", # No path prefix + "aws_access_key_id": "test-key-id", + "aws_secret_access_key": "test-secret-key", + } + + _upload_to_s3_and_get_presigned_url("content", "vcon-123", "dialog1", options) + + # Verify key doesn't have prefix + call_kwargs = mock_s3.put_object.call_args[1] + assert call_kwargs["Key"].startswith("vcon-123/dialog1_") + assert not call_kwargs["Key"].startswith("/") + + +@patch('server.links.diet.boto3') +def test_upload_to_s3_failure(mock_boto3): + # Test handling of S3 upload failure + mock_s3 = MagicMock() + mock_boto3.client.return_value = mock_s3 + mock_s3.put_object.side_effect = ClientError( + {"Error": {"Code": "AccessDenied", "Message": "Access Denied"}}, + "PutObject" + ) + + options = { + "s3_bucket": "test-bucket", + "aws_access_key_id": "test-key-id", + "aws_secret_access_key": "test-secret-key", + } + + result = _upload_to_s3_and_get_presigned_url("content", "vcon-123", "dialog1", options) + + assert result is None + + +@patch('server.links.diet.redis') +@patch('server.links.diet.boto3') +def test_run_with_s3_storage(mock_boto3, mock_redis, sample_vcon, s3_options): + # Test the full run function with S3 storage + mock_json = MagicMock() + mock_redis.json.return_value = mock_json + mock_json.get.return_value = sample_vcon + + mock_s3 = MagicMock() + mock_boto3.client.return_value = mock_s3 + mock_s3.generate_presigned_url.return_value = "https://test-bucket.s3.amazonaws.com/presigned-url" + + run("test-vcon-123", "diet", s3_options) + + # Verify JSON.SET was called + mock_json.set.assert_called_once() + args, kwargs = mock_json.set.call_args + saved_vcon = args[2] + + # Check that dialog bodies were replaced with presigned URLs + assert saved_vcon["dialog"][0]["body"] == "https://test-bucket.s3.amazonaws.com/presigned-url" + assert saved_vcon["dialog"][0]["body_type"] == "url" + assert saved_vcon["dialog"][1]["body"] == "https://test-bucket.s3.amazonaws.com/presigned-url" + assert saved_vcon["dialog"][1]["body_type"] == "url" + + # Verify S3 was called for each dialog + assert mock_s3.put_object.call_count == 2 + + +@patch('server.links.diet.redis') +@patch('server.links.diet.boto3') +def test_run_with_s3_storage_failure_removes_body(mock_boto3, mock_redis, sample_vcon, s3_options): + # Test that body is removed when S3 upload fails + mock_json = MagicMock() + mock_redis.json.return_value = mock_json + mock_json.get.return_value = sample_vcon + + mock_s3 = MagicMock() + mock_boto3.client.return_value = mock_s3 + mock_s3.put_object.side_effect = ClientError( + {"Error": {"Code": "AccessDenied", "Message": "Access Denied"}}, + "PutObject" + ) + + run("test-vcon-123", "diet", s3_options) + + # Verify JSON.SET was called + mock_json.set.assert_called_once() + args, kwargs = mock_json.set.call_args + saved_vcon = args[2] + + # Check that dialog bodies were removed due to failure + assert saved_vcon["dialog"][0]["body"] == "" + assert saved_vcon["dialog"][1]["body"] == "" + + +@patch('server.links.diet.redis') +@patch('server.links.diet.boto3') +def test_s3_takes_precedence_over_post_url(mock_boto3, mock_redis, sample_vcon): + # Test that S3 storage takes precedence over post_media_to_url + mock_json = MagicMock() + mock_redis.json.return_value = mock_json + mock_json.get.return_value = sample_vcon + + mock_s3 = MagicMock() + mock_boto3.client.return_value = mock_s3 + mock_s3.generate_presigned_url.return_value = "https://s3-presigned-url" + + options = { + "remove_dialog_body": True, + "s3_bucket": "test-bucket", + "aws_access_key_id": "test-key-id", + "aws_secret_access_key": "test-secret-key", + "post_media_to_url": "https://should-not-be-called.com", # This should be ignored + } + + with patch('server.links.diet.requests.post') as mock_post: + run("test-vcon-123", "diet", options) + # post_media_to_url should not be called when S3 is configured + mock_post.assert_not_called() + + # Verify S3 was used instead + assert mock_s3.put_object.call_count == 2 + + +@patch('server.links.diet.redis') +def test_options_logging_redacts_aws_secret_access_key(mock_redis, sample_vcon, caplog): + # Ensure secrets are not written to logs + mock_json = MagicMock() + mock_redis.json.return_value = mock_json + mock_json.get.return_value = sample_vcon + + secret = "test-secret-key" + caplog.set_level(logging.INFO) + + run("test-vcon-123", "diet", { + "remove_dialog_body": False, + "aws_secret_access_key": secret, + }) + + assert secret not in caplog.text + assert "diet::aws_secret_access_key: [REDACTED]" in caplog.text \ No newline at end of file