diff --git a/CHANGELOG.md b/CHANGELOG.md index b4b79a3..3f1a261 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,12 @@ # Changelog +## 1.4.0 + * Pass the configurable `region_name` property when listing S3 objects + * [#64](https://github.com/singer-io/tap-s3-csv/pull/64) + ## 1.3.8 * Add Missing Type Information in JSON Schema - * [#55](https://github.com/singer-io/tap-s3-csv/pull/62) + * [#62](https://github.com/singer-io/tap-s3-csv/pull/62) ## 1.3.7 * Remove Backoff for Access Denied errors diff --git a/README.md b/README.md index fc0e19f..2019ca0 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ Here is an example of basic config, and a bit of a run down on each of the prope "account_id": "1234567890", "role_name": "role_with_bucket_access", "bucket": "my-bucket", + "region_name": "eu-central-1", "external_id": "my_optional_secret_external_id", "tables": "[{\"search_prefix\":\"exports\",\"search_pattern\":\"my_table\\\\/.*\\\\.csv\",\"table_name\":\"my_table\",\"key_properties\":\"id\",\"date_overrides\":\"created_at\",\"delimiter\":\",\"}]", "request_timeout": 300 @@ -41,6 +42,7 @@ Here is an example of basic config, and a bit of a run down on each of the prope - **account_id**: This is your AWS account id - **role_name**: In order to access a bucket, the tap uses boto3 to assume a role in your AWS account. If you have your AWS account credentials set up locally, you can specify this as a role which your local user has access to assume, and boto3 should by default pick up your AWS keys from the local environment. - **bucket**: The name of the bucket to search for files under. +- **region_name**: The name of the AWS region in which the bucket is located. - **external_id**: (potentially optional) Running this locally, you should be able to omit this property, it is provided to allow the tap to access buckets in accounts where the user doesn't have access to the account itself, but is able to assume a role in that account, through a shared secret.
This is that secret, in that case. - **tables**: An escaped JSON string that the tap will use to search for files, and emit records as "tables" from those files. Will be used by a [`voluptuous`](https://github.com/alecthomas/voluptuous)-based configuration checker. - **request_timeout**: (optional) The maximum time for which request should wait to get a response. Default request_timeout is 300 seconds. diff --git a/config.sample.json b/config.sample.json index 83d74a2..3bf65a5 100644 --- a/config.sample.json +++ b/config.sample.json @@ -3,6 +3,7 @@ "account_id": "1234567890", "role_name": "role_with_bucket_access", "bucket": "my-bucket", + "region_name": "eu-central-1", "external_id": "my_optional_secret_external_id", "tables": "[{\"search_prefix\":\"exports\",\"search_pattern\":\"my_table\\\\/.*\\\\.csv\",\"table_name\":\"my_table\",\"key_properties\":\"id\",\"date_overrides\":\"created_at\",\"delimiter\":\",\"}]", "request_timeout": 300 diff --git a/setup.py b/setup.py index 3dd26d6..d2d75aa 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup setup(name='tap-s3-csv', - version='1.3.8', + version='1.4.0', description='Singer.io tap for extracting CSV files from S3', author='Stitch', url='https://singer.io', diff --git a/tap_s3_csv/s3.py b/tap_s3_csv/s3.py index f728724..12eb5a4 100644 --- a/tap_s3_csv/s3.py +++ b/tap_s3_csv/s3.py @@ -489,7 +489,8 @@ def get_request_timeout(config): def list_files_in_bucket(config, search_prefix=None): # Set connect and read timeout for resource timeout = get_request_timeout(config) - client_config = Config(connect_timeout=timeout, read_timeout=timeout) + bucket_region_name = config.get('region_name') + client_config = Config(connect_timeout=timeout, read_timeout=timeout, region_name=bucket_region_name) s3_client = boto3.client('s3', config=client_config) s3_object_count = 0 diff --git a/tests/unittests/test_request_timeout.py b/tests/unittests/test_request_timeout.py index ab27577..8a9276d 100644 --- 
a/tests/unittests/test_request_timeout.py +++ b/tests/unittests/test_request_timeout.py @@ -13,108 +13,108 @@ def test_no_request_timeout_in_config(self, mocked_boto_config, mocked_client, m """ Verify that if request_timeout is not provided in config then default value is used """ - config = {"bucket": "test"} # No timeout in config + config = {"bucket": "test", "region_name": "region-name"} # No timeout in config # Call get_file_handle() which set timeout with Config object s3.get_file_handle(config, "test") # Verify Config is called with expected timeout - mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300) + mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300, region_name="region-name") # Call list_files_in_bucket() which set timeout with Config object file_handles = list(s3.list_files_in_bucket(config, "test")) # Verify Config is called with expected timeout - mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300) + mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300, region_name="region-name") def test_integer_request_timeout_in_config(self, mocked_boto_config, mocked_client, mocked_resource): """ Verify that if request_timeout is provided in config(integer value) then it should be use """ - config = {"bucket": "test", "request_timeout": 100} # integer timeout in config + config = {"bucket": "test", "request_timeout": 100, "region_name": "region-name"} # integer timeout in config # Call get_file_handle() which set timeout with Config object s3.get_file_handle(config, "test") # Verify Config is called with expected timeout - mocked_boto_config.assert_called_with(connect_timeout=100, read_timeout=100) + mocked_boto_config.assert_called_with(connect_timeout=100, read_timeout=100, region_name="region-name") config = {"bucket": "test", "request_timeout": 200} # integer timeout in config # Call list_files_in_bucket() which set timeout with Config object file_handles = 
list(s3.list_files_in_bucket(config, "test")) # Verify Config is called with expected timeout - mocked_boto_config.assert_called_with(connect_timeout=200, read_timeout=200) + mocked_boto_config.assert_called_with(connect_timeout=200, read_timeout=200, region_name="region-name") def test_float_request_timeout_in_config(self, mocked_boto_config, mocked_client, mocked_resource): """ Verify that if request_timeout is provided in config(float value) then it should be use """ - config = {"bucket": "test", "request_timeout": 100.5} # float timeout in config + config = {"bucket": "test", "request_timeout": 100.5, "region_name": "region-name"} # float timeout in config # Call get_file_handle() which set timeout with Config object s3.get_file_handle(config, "test") # Verify Config is called with expected timeout - mocked_boto_config.assert_called_with(connect_timeout=100.5, read_timeout=100.5) + mocked_boto_config.assert_called_with(connect_timeout=100.5, read_timeout=100.5, region_name="region-name") config = {"bucket": "test", "request_timeout": 200.5} # float timeout in config # Call list_files_in_bucket() which set timeout with Config object file_handles = list(s3.list_files_in_bucket(config, "test")) # Verify Config is called with expected timeout - mocked_boto_config.assert_called_with(connect_timeout=200.5, read_timeout=200.5) + mocked_boto_config.assert_called_with(connect_timeout=200.5, read_timeout=200.5, region_name="region-name") def test_string_request_timeout_in_config(self, mocked_boto_config, mocked_client, mocked_resource): """ Verify that if request_timeout is provided in config(string value) then it should be use """ - config = {"bucket": "test", "request_timeout": '100'} # string format timeout in config + config = {"bucket": "test", "request_timeout": "100", "region_name": "region-name"} # string format timeout in config # Call get_file_handle() which set timeout with Config object s3.get_file_handle(config, "test") # Verify Config is called with 
expected timeout - mocked_boto_config.assert_called_with(connect_timeout=100, read_timeout=100) + mocked_boto_config.assert_called_with(connect_timeout=100, read_timeout=100, region_name="region-name") # Call list_files_in_bucket() which set timeout with Config object file_handles = list(s3.list_files_in_bucket(config, "test")) # Verify Config is called with expected timeout - mocked_boto_config.assert_called_with(connect_timeout=100, read_timeout=100) + mocked_boto_config.assert_called_with(connect_timeout=100, read_timeout=100, region_name="region-name") def test_empty_string_request_timeout_in_config(self, mocked_boto_config, mocked_client, mocked_resource): """ Verify that if request_timeout is provided in config with empty string then default value is used """ - config = {"bucket": "test", "request_timeout": ''} # empty string in config + config = {"bucket": "test", "request_timeout": "", "region_name": "region-name"} # empty string in config # Call get_file_handle() which set timeout with Config object s3.get_file_handle(config, "test") # Verify Config is called with expected timeout - mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300) + mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300, region_name="region-name") # Call list_files_in_bucket() which set timeout with Config object file_handles = list(s3.list_files_in_bucket(config, "test")) # Verify Config is called with expected timeout - mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300) + mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300, region_name="region-name") def test_zero_request_timeout_in_config(self, mocked_boto_config, mocked_client, mocked_resource): """ Verify that if request_timeout is provided in config with zero value then default value is used """ - config = {"bucket": "test", "request_timeout": 0.0} # zero value in config + config = {"bucket": "test", "request_timeout": 0.0, 
"region_name": "region-name"} # zero value in config # Call get_file_handle() which set timeout with Config object s3.get_file_handle(config, "test") # Verify Config is called with expected timeout - mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300) + mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300, region_name="region-name") # Call list_files_in_bucket() which set timeout with Config object file_handles = list(s3.list_files_in_bucket(config, "test")) # Verify Config is called with expected timeout - mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300) + mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300, region_name="region-name") def test_zero_string_request_timeout_in_config(self, mocked_boto_config, mocked_client, mocked_resource): """ Verify that if request_timeout is provided in config with zero in string format then default value is used """ - config = {"bucket": "test", "request_timeout": '0.0'} # zero value in config + config = {"bucket": "test", "request_timeout": "0.0", "region_name": "region-name"} # zero value in config # Call get_file_handle() which set timeout with Config object s3.get_file_handle(config, "test") # Verify Config is called with expected timeout - mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300) + mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300, region_name="region-name") # Call list_files_in_bucket() which set timeout with Config object file_handles = list(s3.list_files_in_bucket(config, "test")) # Verify Config is called with expected timeout - mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300) + mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300, region_name="region-name") # Mock objects for boto resource class MockObjectConnect: @@ -124,7 +124,7 @@ def get(): class MockBucketConnect: def Object(self): return 
MockObjectConnect - + class MockResourceConnect: def Bucket(self): return MockBucketConnect @@ -145,7 +145,7 @@ def test_connect_timeout_on_get_file_handle(self, mocked_boto_config, mocked_res s3.get_file_handle(config, "test") except ConnectTimeoutError as e: pass - + # Verify that resource ans Config object called 5 times self.assertEquals(mocked_resource.call_count, 5) self.assertEquals(mocked_boto_config.call_count, 5) @@ -157,14 +157,14 @@ def test_connect_timeout_on_make_request(self, mocked_sleep): # Mock PageIterator.method to raise ConnectTimeoutError error mocked_method = mock.Mock() mocked_method.side_effect = ConnectTimeoutError(endpoint_url="test") - + try: # Initialize PageIterator object and call _make_request function paginator = PageIterator(mocked_method, "", "", "", "", "", "", "", "", "", "") response = paginator._make_request({}) except ConnectTimeoutError as e: pass - + # Verify that PageIterator.method called 5 times self.assertEquals(mocked_method.call_count, 5) @@ -177,14 +177,14 @@ def get(): class MockBucketRead: def Object(self): return MockObjectRead - + class MockResourceRead: def Bucket(self): return MockBucketRead @mock.patch("time.sleep") class TestReadTimeoutErrorBackoff(unittest.TestCase): - + @mock.patch("boto3.resource") @mock.patch("tap_s3_csv.s3.Config") def test_read_timeout_on_get_file_handle(self, mocked_boto_config, mocked_resource, mocked_sleep): @@ -210,13 +210,13 @@ def test_read_timeout_on_make_request(self, mocked_sleep): # Mock PageIterator.method to raise ReadTimeoutError error mocked_method = mock.Mock() mocked_method.side_effect = ReadTimeoutError(endpoint_url="test") - + try: # Initialize PageIterator object and call _make_request function paginator = PageIterator(mocked_method, "", "", "", "", "", "", "", "", "", "") response = paginator._make_request({}) except ReadTimeoutError as e: pass - + # Verify that PageIterator.method called 5 times self.assertEquals(mocked_method.call_count, 5)