diff --git a/example_config.yml b/example_config.yml index a12083b..cb559bd 100644 --- a/example_config.yml +++ b/example_config.yml @@ -125,6 +125,9 @@ storages: aws_access_key_id: some_key aws_secret_access_key: some_secret aws_bucket: some_bucket + aws_region: us-east-1 # AWS region where the bucket is located + # endpoint_url: null # Optional: custom endpoint for S3-compatible services + # s3_path: vcons # Optional: prefix for S3 keys milvus: module: storage.milvus options: diff --git a/server/storage/s3/README.md b/server/storage/s3/README.md index 83d4b75..6a4ef74 100644 --- a/server/storage/s3/README.md +++ b/server/storage/s3/README.md @@ -8,31 +8,51 @@ S3 storage provides scalable, durable object storage capabilities, making it ide ## Configuration -Required configuration options: +Configuration options: ```yaml storages: s3: module: storage.s3 options: - bucket: your-bucket-name # S3 bucket name - region: us-west-2 # AWS region - access_key: your-access-key # AWS access key - secret_key: your-secret-key # AWS secret key - prefix: vcons/ # Optional: key prefix - endpoint_url: null # Optional: custom endpoint + # Required options + aws_access_key_id: your-access-key # AWS access key ID + aws_secret_access_key: your-secret-key # AWS secret access key + aws_bucket: your-bucket-name # S3 bucket name + + # Optional options + aws_region: us-east-1 # AWS region (recommended to avoid cross-region errors) + endpoint_url: null # Custom endpoint for S3-compatible services (e.g., MinIO) + s3_path: vcons # Prefix for S3 keys (optional) +``` + +### Configuration Options + +| Option | Required | Description | +|--------|----------|-------------| +| `aws_access_key_id` | Yes | AWS access key ID for authentication | +| `aws_secret_access_key` | Yes | AWS secret access key for authentication | +| `aws_bucket` | Yes | Name of the S3 bucket to store vCons | +| `aws_region` | No | AWS region where the bucket is located (e.g., `us-east-1`, `us-west-2`, `eu-west-1`). 
**Recommended** to avoid "AuthorizationHeaderMalformed" errors when the bucket is in a different region than the default. | +| `endpoint_url` | No | Custom endpoint URL for S3-compatible services like MinIO, LocalStack, or other providers | +| `s3_path` | No | Prefix path for organizing vCon objects within the bucket | + +### Region Configuration + +**Important:** If your S3 bucket is in a region other than `us-east-1`, you should explicitly set the `aws_region` option. Without this, you may encounter errors like: + +``` +AuthorizationHeaderMalformed: The authorization header is malformed; +the region 'us-east-1' is wrong; expecting 'us-east-2' ``` ## Features -- Object storage -- High availability -- Durability -- Versioning support -- Lifecycle management -- Automatic metrics logging -- Encryption support -- Access control +- Object storage with simple flat key organization (`[s3_path/]uuid.vcon`) +- High availability and durability +- Support for custom S3-compatible endpoints (MinIO, LocalStack, etc.) +- Configurable key prefix for organizing objects +- Automatic error logging ## Usage @@ -42,21 +62,24 @@ from storage import Storage # Initialize S3 storage s3_storage = Storage("s3") -# Save vCon data +# Save vCon data (retrieves from Redis and stores in S3) s3_storage.save(vcon_id) # Retrieve vCon data vcon_data = s3_storage.get(vcon_id) ``` -## Implementation Details +## Key Structure + +vCons are stored with keys following this pattern: +``` +[s3_path/]uuid.vcon +``` -The S3 storage implementation: -- Uses boto3 for AWS S3 operations -- Implements retry logic -- Supports multipart uploads -- Provides encryption -- Includes automatic metrics logging +For example, a vCon with UUID `abc123` and `s3_path: vcons` would be stored at: +``` +vcons/abc123.vcon +``` ## Dependencies @@ -65,15 +88,11 @@ The S3 storage implementation: ## Best Practices -1. Secure credential management -2. 
Implement proper access control -3. Use appropriate storage classes -4. Enable versioning -5. Configure lifecycle rules -6. Implement proper error handling -7. Use appropriate encryption -8. Monitor costs -9. Implement retry logic -10. Use appropriate regions -11. Enable logging -12. Regular backup verification \ No newline at end of file +1. Always configure `aws_region` to match your bucket's region +2. Use IAM roles with least-privilege access +3. Enable bucket versioning for data protection +4. Configure lifecycle rules for cost optimization +5. Enable server-side encryption +6. Use VPC endpoints for private connectivity +7. Monitor with CloudWatch metrics +8. Enable access logging for auditing \ No newline at end of file diff --git a/server/storage/s3/__init__.py b/server/storage/s3/__init__.py index 876009d..ef37a51 100644 --- a/server/storage/s3/__init__.py +++ b/server/storage/s3/__init__.py @@ -3,7 +3,6 @@ from lib.logging_utils import init_logger from server.lib.vcon_redis import VconRedis import boto3 -from datetime import datetime logger = init_logger(__name__) @@ -11,6 +10,39 @@ default_options = {} +def _create_s3_client(opts: dict): + """Create an S3 client with the provided options. 
+ + Required options: + aws_access_key_id: AWS access key ID + aws_secret_access_key: AWS secret access key + + Optional options: + aws_region: AWS region (e.g., 'us-east-1', 'us-west-2') + endpoint_url: Custom endpoint URL for S3-compatible services + """ + client_kwargs = { + "aws_access_key_id": opts["aws_access_key_id"], + "aws_secret_access_key": opts["aws_secret_access_key"], + } + + if opts.get("aws_region"): + client_kwargs["region_name"] = opts["aws_region"] + + if opts.get("endpoint_url"): + client_kwargs["endpoint_url"] = opts["endpoint_url"] + + return boto3.client("s3", **client_kwargs) + + +def _build_s3_key(vcon_uuid: str, s3_path: Optional[str] = None) -> str: + """Build the S3 object key for a vCon.""" + key = f"{vcon_uuid}.vcon" + if not s3_path: + return key + return f"{s3_path.rstrip('/')}/{key}" + + def save( vcon_uuid, opts=default_options, @@ -19,19 +51,9 @@ def save( try: vcon_redis = VconRedis() vcon = vcon_redis.get_vcon(vcon_uuid) - s3 = boto3.client( - "s3", - aws_access_key_id=opts["aws_access_key_id"], - aws_secret_access_key=opts["aws_secret_access_key"], - ) + s3 = _create_s3_client(opts) - s3_path = opts.get("s3_path") - created_at = datetime.fromisoformat(vcon.created_at) - timestamp = created_at.strftime("%Y/%m/%d") - key = vcon_uuid + ".vcon" - destination_directory = f"{timestamp}/{key}" - if s3_path: - destination_directory = s3_path + "/" + destination_directory + destination_directory = _build_s3_key(vcon_uuid, opts.get("s3_path")) s3.put_object( Bucket=opts["aws_bucket"], Key=destination_directory, Body=vcon.dumps() ) @@ -45,15 +67,10 @@ def save( def get(vcon_uuid: str, opts=default_options) -> Optional[dict]: """Get a vCon from S3 by UUID.""" try: - s3 = boto3.client( - "s3", - aws_access_key_id=opts["aws_access_key_id"], - aws_secret_access_key=opts["aws_secret_access_key"], - ) - - s3_path = opts.get("s3_path", "") - key = f"{s3_path}/{vcon_uuid}.vcon" if s3_path else f"{vcon_uuid}.vcon" - + s3 = _create_s3_client(opts) 
+ + key = _build_s3_key(vcon_uuid, opts.get("s3_path")) + response = s3.get_object(Bucket=opts["aws_bucket"], Key=key) return json.loads(response['Body'].read().decode('utf-8')) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/storage/__init__.py b/tests/storage/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/storage/test_s3.py b/tests/storage/test_s3.py new file mode 100644 index 0000000..70d5d70 --- /dev/null +++ b/tests/storage/test_s3.py @@ -0,0 +1,499 @@ +"""Comprehensive tests for S3 storage module. + +Tests cover: +- S3 client creation with various configurations +- Region configuration handling +- Custom endpoint URL support +- Save and get operations +- Error handling +""" + +import json +import pytest +import sys +from unittest.mock import patch, MagicMock, Mock +from datetime import datetime +from io import BytesIO + +# Mock the logger module before importing the S3 module +mock_logger = MagicMock() +sys.modules["lib.logging_utils"] = MagicMock(init_logger=MagicMock(return_value=mock_logger)) +sys.modules["server.lib.vcon_redis"] = MagicMock() + +from server.storage.s3 import _create_s3_client, _build_s3_key, save, get, default_options + + +class TestCreateS3Client: + """Tests for the _create_s3_client helper function.""" + + def test_create_client_with_required_options_only(self): + """Test client creation with only required options.""" + opts = { + "aws_access_key_id": "test-access-key", + "aws_secret_access_key": "test-secret-key", + } + + with patch("server.storage.s3.boto3.client") as mock_client: + _create_s3_client(opts) + + mock_client.assert_called_once_with( + "s3", + aws_access_key_id="test-access-key", + aws_secret_access_key="test-secret-key", + ) + + def test_create_client_with_region(self): + """Test client creation with aws_region specified.""" + opts = { + "aws_access_key_id": "test-access-key", + "aws_secret_access_key": "test-secret-key", + 
"aws_region": "us-west-2", + } + + with patch("server.storage.s3.boto3.client") as mock_client: + _create_s3_client(opts) + + mock_client.assert_called_once_with( + "s3", + aws_access_key_id="test-access-key", + aws_secret_access_key="test-secret-key", + region_name="us-west-2", + ) + + def test_create_client_with_us_east_2_region(self): + """Test client creation with us-east-2 region (the error case from issue).""" + opts = { + "aws_access_key_id": "test-access-key", + "aws_secret_access_key": "test-secret-key", + "aws_region": "us-east-2", + } + + with patch("server.storage.s3.boto3.client") as mock_client: + _create_s3_client(opts) + + mock_client.assert_called_once_with( + "s3", + aws_access_key_id="test-access-key", + aws_secret_access_key="test-secret-key", + region_name="us-east-2", + ) + + def test_create_client_with_endpoint_url(self): + """Test client creation with custom endpoint URL (e.g., MinIO).""" + opts = { + "aws_access_key_id": "test-access-key", + "aws_secret_access_key": "test-secret-key", + "endpoint_url": "http://localhost:9000", + } + + with patch("server.storage.s3.boto3.client") as mock_client: + _create_s3_client(opts) + + mock_client.assert_called_once_with( + "s3", + aws_access_key_id="test-access-key", + aws_secret_access_key="test-secret-key", + endpoint_url="http://localhost:9000", + ) + + def test_create_client_with_all_options(self): + """Test client creation with all options specified.""" + opts = { + "aws_access_key_id": "test-access-key", + "aws_secret_access_key": "test-secret-key", + "aws_region": "eu-west-1", + "endpoint_url": "https://s3.eu-west-1.amazonaws.com", + } + + with patch("server.storage.s3.boto3.client") as mock_client: + _create_s3_client(opts) + + mock_client.assert_called_once_with( + "s3", + aws_access_key_id="test-access-key", + aws_secret_access_key="test-secret-key", + region_name="eu-west-1", + endpoint_url="https://s3.eu-west-1.amazonaws.com", + ) + + def test_create_client_ignores_empty_region(self): + 
"""Test that empty string region is ignored.""" + opts = { + "aws_access_key_id": "test-access-key", + "aws_secret_access_key": "test-secret-key", + "aws_region": "", + } + + with patch("server.storage.s3.boto3.client") as mock_client: + _create_s3_client(opts) + + mock_client.assert_called_once_with( + "s3", + aws_access_key_id="test-access-key", + aws_secret_access_key="test-secret-key", + ) + + def test_create_client_ignores_none_region(self): + """Test that None region is ignored.""" + opts = { + "aws_access_key_id": "test-access-key", + "aws_secret_access_key": "test-secret-key", + "aws_region": None, + } + + with patch("server.storage.s3.boto3.client") as mock_client: + _create_s3_client(opts) + + mock_client.assert_called_once_with( + "s3", + aws_access_key_id="test-access-key", + aws_secret_access_key="test-secret-key", + ) + + def test_create_client_with_various_aws_regions(self): + """Test client creation with various AWS regions.""" + regions = [ + "us-east-1", + "us-east-2", + "us-west-1", + "us-west-2", + "eu-west-1", + "eu-west-2", + "eu-central-1", + "ap-southeast-1", + "ap-northeast-1", + "sa-east-1", + ] + + for region in regions: + opts = { + "aws_access_key_id": "test-access-key", + "aws_secret_access_key": "test-secret-key", + "aws_region": region, + } + + with patch("server.storage.s3.boto3.client") as mock_client: + _create_s3_client(opts) + + mock_client.assert_called_once_with( + "s3", + aws_access_key_id="test-access-key", + aws_secret_access_key="test-secret-key", + region_name=region, + ) + + +class TestBuildS3Key: + """Tests for the _build_s3_key helper function.""" + + def test_build_key_without_prefix(self): + """Test key building without s3_path prefix.""" + key = _build_s3_key("test-uuid") + assert key == "test-uuid.vcon" + + def test_build_key_with_prefix(self): + """Test key building with s3_path prefix.""" + key = _build_s3_key("test-uuid", "vcons") + assert key == "vcons/test-uuid.vcon" + + def 
test_build_key_with_trailing_slash_prefix(self): + """Test key building with trailing slash in prefix.""" + key = _build_s3_key("test-uuid", "vcons/") + assert key == "vcons/test-uuid.vcon" + + def test_build_key_with_none_prefix(self): + """Test key building with None prefix.""" + key = _build_s3_key("test-uuid", None) + assert key == "test-uuid.vcon" + + def test_build_key_with_empty_prefix(self): + """Test key building with empty string prefix.""" + key = _build_s3_key("test-uuid", "") + assert key == "test-uuid.vcon" + + def test_build_key_with_nested_prefix(self): + """Test key building with nested prefix.""" + key = _build_s3_key("test-uuid", "data/vcons/archive") + assert key == "data/vcons/archive/test-uuid.vcon" + + +class TestSave: + """Tests for the save function.""" + + @pytest.fixture + def mock_vcon(self): + """Create a mock vCon object.""" + mock = MagicMock() + mock.dumps.return_value = '{"uuid": "test-uuid", "vcon": "1.0.0"}' + return mock + + @pytest.fixture + def base_opts(self): + """Base options for S3 storage.""" + return { + "aws_access_key_id": "test-access-key", + "aws_secret_access_key": "test-secret-key", + "aws_bucket": "test-bucket", + } + + def test_save_basic(self, mock_vcon, base_opts): + """Test basic save operation.""" + with patch("server.storage.s3.VconRedis") as mock_redis_class, \ + patch("server.storage.s3.boto3.client") as mock_boto_client: + + mock_redis = MagicMock() + mock_redis.get_vcon.return_value = mock_vcon + mock_redis_class.return_value = mock_redis + + mock_s3 = MagicMock() + mock_boto_client.return_value = mock_s3 + + save("test-uuid", base_opts) + + mock_redis.get_vcon.assert_called_once_with("test-uuid") + mock_s3.put_object.assert_called_once() + + call_args = mock_s3.put_object.call_args + assert call_args.kwargs["Bucket"] == "test-bucket" + assert call_args.kwargs["Key"] == "test-uuid.vcon" + + def test_save_with_s3_path_prefix(self, mock_vcon, base_opts): + """Test save operation with s3_path prefix.""" + 
base_opts["s3_path"] = "vcons" + + with patch("server.storage.s3.VconRedis") as mock_redis_class, \ + patch("server.storage.s3.boto3.client") as mock_boto_client: + + mock_redis = MagicMock() + mock_redis.get_vcon.return_value = mock_vcon + mock_redis_class.return_value = mock_redis + + mock_s3 = MagicMock() + mock_boto_client.return_value = mock_s3 + + save("test-uuid", base_opts) + + call_args = mock_s3.put_object.call_args + assert call_args.kwargs["Key"] == "vcons/test-uuid.vcon" + + def test_save_with_region(self, mock_vcon, base_opts): + """Test save operation with region specified.""" + base_opts["aws_region"] = "us-east-2" + + with patch("server.storage.s3.VconRedis") as mock_redis_class, \ + patch("server.storage.s3.boto3.client") as mock_boto_client: + + mock_redis = MagicMock() + mock_redis.get_vcon.return_value = mock_vcon + mock_redis_class.return_value = mock_redis + + mock_s3 = MagicMock() + mock_boto_client.return_value = mock_s3 + + save("test-uuid", base_opts) + + mock_boto_client.assert_called_once_with( + "s3", + aws_access_key_id="test-access-key", + aws_secret_access_key="test-secret-key", + region_name="us-east-2", + ) + + def test_save_raises_exception_on_error(self, mock_vcon, base_opts): + """Test that save raises exception on S3 error.""" + with patch("server.storage.s3.VconRedis") as mock_redis_class, \ + patch("server.storage.s3.boto3.client") as mock_boto_client: + + mock_redis = MagicMock() + mock_redis.get_vcon.return_value = mock_vcon + mock_redis_class.return_value = mock_redis + + mock_s3 = MagicMock() + mock_s3.put_object.side_effect = Exception("S3 Error") + mock_boto_client.return_value = mock_s3 + + with pytest.raises(Exception, match="S3 Error"): + save("test-uuid", base_opts) + + +class TestGet: + """Tests for the get function.""" + + @pytest.fixture + def base_opts(self): + """Base options for S3 storage.""" + return { + "aws_access_key_id": "test-access-key", + "aws_secret_access_key": "test-secret-key", + "aws_bucket": 
"test-bucket", + } + + def test_get_basic(self, base_opts): + """Test basic get operation.""" + vcon_data = {"uuid": "test-uuid", "vcon": "1.0.0"} + + with patch("server.storage.s3.boto3.client") as mock_boto_client: + mock_s3 = MagicMock() + mock_response = { + "Body": BytesIO(json.dumps(vcon_data).encode("utf-8")) + } + mock_s3.get_object.return_value = mock_response + mock_boto_client.return_value = mock_s3 + + result = get("test-uuid", base_opts) + + assert result == vcon_data + mock_s3.get_object.assert_called_once_with( + Bucket="test-bucket", + Key="test-uuid.vcon" + ) + + def test_get_with_s3_path_prefix(self, base_opts): + """Test get operation with s3_path prefix.""" + base_opts["s3_path"] = "vcons" + vcon_data = {"uuid": "test-uuid"} + + with patch("server.storage.s3.boto3.client") as mock_boto_client: + mock_s3 = MagicMock() + mock_response = { + "Body": BytesIO(json.dumps(vcon_data).encode("utf-8")) + } + mock_s3.get_object.return_value = mock_response + mock_boto_client.return_value = mock_s3 + + result = get("test-uuid", base_opts) + + mock_s3.get_object.assert_called_once_with( + Bucket="test-bucket", + Key="vcons/test-uuid.vcon" + ) + + def test_get_with_region(self, base_opts): + """Test get operation with region specified.""" + base_opts["aws_region"] = "eu-west-1" + vcon_data = {"uuid": "test-uuid"} + + with patch("server.storage.s3.boto3.client") as mock_boto_client: + mock_s3 = MagicMock() + mock_response = { + "Body": BytesIO(json.dumps(vcon_data).encode("utf-8")) + } + mock_s3.get_object.return_value = mock_response + mock_boto_client.return_value = mock_s3 + + result = get("test-uuid", base_opts) + + mock_boto_client.assert_called_once_with( + "s3", + aws_access_key_id="test-access-key", + aws_secret_access_key="test-secret-key", + region_name="eu-west-1", + ) + + def test_get_returns_none_on_error(self, base_opts): + """Test that get returns None on S3 error.""" + with patch("server.storage.s3.boto3.client") as mock_boto_client: + mock_s3 
= MagicMock() + mock_s3.get_object.side_effect = Exception("S3 Error") + mock_boto_client.return_value = mock_s3 + + result = get("test-uuid", base_opts) + + assert result is None + + def test_get_returns_none_on_not_found(self, base_opts): + """Test that get returns None when object not found.""" + from botocore.exceptions import ClientError + + with patch("server.storage.s3.boto3.client") as mock_boto_client: + mock_s3 = MagicMock() + error_response = {"Error": {"Code": "NoSuchKey", "Message": "Not found"}} + mock_s3.get_object.side_effect = ClientError(error_response, "GetObject") + mock_boto_client.return_value = mock_s3 + + result = get("nonexistent-uuid", base_opts) + + assert result is None + + +class TestRegionErrorScenario: + """Test the specific error scenario that was reported.""" + + def test_without_region_uses_default(self): + """Test that without region, boto3 uses its default (us-east-1).""" + opts = { + "aws_access_key_id": "test-key", + "aws_secret_access_key": "test-secret", + } + + with patch("server.storage.s3.boto3.client") as mock_client: + _create_s3_client(opts) + + # Verify region_name is NOT passed when not specified + call_kwargs = mock_client.call_args.kwargs + assert "region_name" not in call_kwargs + + def test_with_region_passes_to_client(self): + """Test that specifying region passes it to boto3 client.""" + opts = { + "aws_access_key_id": "test-key", + "aws_secret_access_key": "test-secret", + "aws_region": "us-east-2", + } + + with patch("server.storage.s3.boto3.client") as mock_client: + _create_s3_client(opts) + + call_kwargs = mock_client.call_args.kwargs + assert "region_name" in call_kwargs + assert call_kwargs["region_name"] == "us-east-2" + + +class TestEndpointUrlScenarios: + """Test custom endpoint URL scenarios for S3-compatible services.""" + + def test_minio_endpoint(self): + """Test configuration for MinIO.""" + opts = { + "aws_access_key_id": "minioadmin", + "aws_secret_access_key": "minioadmin", + "endpoint_url": 
"http://localhost:9000", + } + + with patch("server.storage.s3.boto3.client") as mock_client: + _create_s3_client(opts) + + call_kwargs = mock_client.call_args.kwargs + assert call_kwargs["endpoint_url"] == "http://localhost:9000" + + def test_localstack_endpoint(self): + """Test configuration for LocalStack.""" + opts = { + "aws_access_key_id": "test", + "aws_secret_access_key": "test", + "endpoint_url": "http://localhost:4566", + "aws_region": "us-east-1", + } + + with patch("server.storage.s3.boto3.client") as mock_client: + _create_s3_client(opts) + + call_kwargs = mock_client.call_args.kwargs + assert call_kwargs["endpoint_url"] == "http://localhost:4566" + assert call_kwargs["region_name"] == "us-east-1" + + def test_digital_ocean_spaces_endpoint(self): + """Test configuration for DigitalOcean Spaces.""" + opts = { + "aws_access_key_id": "do-access-key", + "aws_secret_access_key": "do-secret-key", + "endpoint_url": "https://nyc3.digitaloceanspaces.com", + "aws_region": "nyc3", + } + + with patch("server.storage.s3.boto3.client") as mock_client: + _create_s3_client(opts) + + call_kwargs = mock_client.call_args.kwargs + assert call_kwargs["endpoint_url"] == "https://nyc3.digitaloceanspaces.com" + assert call_kwargs["region_name"] == "nyc3"