Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
# Changelog

## 1.4.0
* Pass the configurable `region_name` property when listing S3 objects
* [#64](https://github.com/singer-io/tap-s3-csv/pull/64)

## 1.3.8
* Add Missing Type Information in JSON Schema
* [#55](https://github.com/singer-io/tap-s3-csv/pull/62)
* [#62](https://github.com/singer-io/tap-s3-csv/pull/62)

## 1.3.7
* Remove Backoff for Access Denied errors
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ Here is an example of basic config, and a bit of a run down on each of the prope
"account_id": "1234567890",
"role_name": "role_with_bucket_access",
"bucket": "my-bucket",
"region_name": "eu-central-1",
"external_id": "my_optional_secret_external_id",
"tables": "[{\"search_prefix\":\"exports\",\"search_pattern\":\"my_table\\\\/.*\\\\.csv\",\"table_name\":\"my_table\",\"key_properties\":\"id\",\"date_overrides\":\"created_at\",\"delimiter\":\",\"}]",
"request_timeout": 300
Expand All @@ -41,6 +42,7 @@ Here is an example of basic config, and a bit of a run down on each of the prope
- **account_id**: This is your AWS account id
- **role_name**: In order to access a bucket, the tap uses boto3 to assume a role in your AWS account. If you have your AWS account credentials set up locally, you can specify this as a role which your local user has access to assume, and boto3 should by default pick up your AWS keys from the local environment.
- **bucket**: The name of the bucket to search for files under.
- **region_name**: The name of the AWS region in which the bucket is located (for example, `eu-central-1`).
- **external_id**: (potentially optional) When running the tap locally, you should be able to omit this property. It is provided to allow the tap to access buckets in accounts where the user doesn't have access to the account itself but is able to assume a role in that account through a shared secret; in that case, this property is that secret.
- **tables**: An escaped JSON string that the tap will use to search for files, and emit records as "tables" from those files. Will be used by a [`voluptuous`](https://github.com/alecthomas/voluptuous)-based configuration checker.
- **request_timeout**: (optional) The maximum time, in seconds, that a request should wait for a response. The default request_timeout is 300 seconds.
Expand Down
1 change: 1 addition & 0 deletions config.sample.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"account_id": "1234567890",
"role_name": "role_with_bucket_access",
"bucket": "my-bucket",
"region_name": "eu-central-1",
"external_id": "my_optional_secret_external_id",
"tables": "[{\"search_prefix\":\"exports\",\"search_pattern\":\"my_table\\\\/.*\\\\.csv\",\"table_name\":\"my_table\",\"key_properties\":\"id\",\"date_overrides\":\"created_at\",\"delimiter\":\",\"}]",
"request_timeout": 300
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from setuptools import setup

setup(name='tap-s3-csv',
version='1.3.8',
version='1.4.0',
description='Singer.io tap for extracting CSV files from S3',
author='Stitch',
url='https://singer.io',
Expand Down
3 changes: 2 additions & 1 deletion tap_s3_csv/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,8 @@ def get_request_timeout(config):
def list_files_in_bucket(config, search_prefix=None):
# Set connect and read timeout for resource
timeout = get_request_timeout(config)
client_config = Config(connect_timeout=timeout, read_timeout=timeout)
bucket_region_name = config.get('region_name')
client_config = Config(connect_timeout=timeout, read_timeout=timeout, region_name=bucket_region_name)
s3_client = boto3.client('s3', config=client_config)

s3_object_count = 0
Expand Down
58 changes: 29 additions & 29 deletions tests/unittests/test_request_timeout.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,108 +13,108 @@ def test_no_request_timeout_in_config(self, mocked_boto_config, mocked_client, m
"""
Verify that if request_timeout is not provided in config then default value is used
"""
config = {"bucket": "test"} # No timeout in config
config = {"bucket": "test", "region_name": "region-name"} # No timeout in config
# Call get_file_handle() which set timeout with Config object
s3.get_file_handle(config, "test")
# Verify Config is called with expected timeout
mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300)
mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300, region_name="region-name")

# Call list_files_in_bucket() which set timeout with Config object
file_handles = list(s3.list_files_in_bucket(config, "test"))
# Verify Config is called with expected timeout
mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300)
mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300, region_name="region-name")

def test_integer_request_timeout_in_config(self, mocked_boto_config, mocked_client, mocked_resource):
"""
Verify that if request_timeout is provided in config (integer value) then it is used
"""
config = {"bucket": "test", "request_timeout": 100} # integer timeout in config
config = {"bucket": "test", "request_timeout": 100, "region_name": "region-name"} # integer timeout in config
# Call get_file_handle() which set timeout with Config object
s3.get_file_handle(config, "test")
# Verify Config is called with expected timeout
mocked_boto_config.assert_called_with(connect_timeout=100, read_timeout=100)
mocked_boto_config.assert_called_with(connect_timeout=100, read_timeout=100, region_name="region-name")

config = {"bucket": "test", "request_timeout": 200} # integer timeout in config
# Call list_files_in_bucket() which set timeout with Config object
file_handles = list(s3.list_files_in_bucket(config, "test"))
# Verify Config is called with expected timeout
mocked_boto_config.assert_called_with(connect_timeout=200, read_timeout=200)
mocked_boto_config.assert_called_with(connect_timeout=200, read_timeout=200, region_name="region-name")

def test_float_request_timeout_in_config(self, mocked_boto_config, mocked_client, mocked_resource):
"""
Verify that if request_timeout is provided in config (float value) then it is used
"""
config = {"bucket": "test", "request_timeout": 100.5} # float timeout in config
config = {"bucket": "test", "request_timeout": 100.5, "region_name": "region-name"} # float timeout in config
# Call get_file_handle() which set timeout with Config object
s3.get_file_handle(config, "test")
# Verify Config is called with expected timeout
mocked_boto_config.assert_called_with(connect_timeout=100.5, read_timeout=100.5)
mocked_boto_config.assert_called_with(connect_timeout=100.5, read_timeout=100.5, region_name="region-name")

config = {"bucket": "test", "request_timeout": 200.5} # float timeout in config
# Call list_files_in_bucket() which set timeout with Config object
file_handles = list(s3.list_files_in_bucket(config, "test"))
# Verify Config is called with expected timeout
mocked_boto_config.assert_called_with(connect_timeout=200.5, read_timeout=200.5)
mocked_boto_config.assert_called_with(connect_timeout=200.5, read_timeout=200.5, region_name="region-name")

def test_string_request_timeout_in_config(self, mocked_boto_config, mocked_client, mocked_resource):
"""
Verify that if request_timeout is provided in config (string value) then it is used
"""
config = {"bucket": "test", "request_timeout": '100'} # string format timeout in config
config = {"bucket": "test", "request_timeout": "100", "region_name": "region-name"} # string format timeout in config
# Call get_file_handle() which set timeout with Config object
s3.get_file_handle(config, "test")
# Verify Config is called with expected timeout
mocked_boto_config.assert_called_with(connect_timeout=100, read_timeout=100)
mocked_boto_config.assert_called_with(connect_timeout=100, read_timeout=100, region_name="region-name")

# Call list_files_in_bucket() which set timeout with Config object
file_handles = list(s3.list_files_in_bucket(config, "test"))
# Verify Config is called with expected timeout
mocked_boto_config.assert_called_with(connect_timeout=100, read_timeout=100)
mocked_boto_config.assert_called_with(connect_timeout=100, read_timeout=100, region_name="region-name")

def test_empty_string_request_timeout_in_config(self, mocked_boto_config, mocked_client, mocked_resource):
"""
Verify that if request_timeout is provided in config with empty string then default value is used
"""
config = {"bucket": "test", "request_timeout": ''} # empty string in config
config = {"bucket": "test", "request_timeout": "", "region_name": "region-name"} # empty string in config
# Call get_file_handle() which set timeout with Config object
s3.get_file_handle(config, "test")
# Verify Config is called with expected timeout
mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300)
mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300, region_name="region-name")

# Call list_files_in_bucket() which set timeout with Config object
file_handles = list(s3.list_files_in_bucket(config, "test"))
# Verify Config is called with expected timeout
mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300)
mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300, region_name="region-name")

def test_zero_request_timeout_in_config(self, mocked_boto_config, mocked_client, mocked_resource):
"""
Verify that if request_timeout is provided in config with zero value then default value is used
"""
config = {"bucket": "test", "request_timeout": 0.0} # zero value in config
config = {"bucket": "test", "request_timeout": 0.0, "region_name": "region-name"} # zero value in config
# Call get_file_handle() which set timeout with Config object
s3.get_file_handle(config, "test")
# Verify Config is called with expected timeout
mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300)
mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300, region_name="region-name")

# Call list_files_in_bucket() which set timeout with Config object
file_handles = list(s3.list_files_in_bucket(config, "test"))
# Verify Config is called with expected timeout
mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300)
mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300, region_name="region-name")

def test_zero_string_request_timeout_in_config(self, mocked_boto_config, mocked_client, mocked_resource):
"""
Verify that if request_timeout is provided in config with zero in string format then default value is used
"""
config = {"bucket": "test", "request_timeout": '0.0'} # zero value in config
config = {"bucket": "test", "request_timeout": "0.0", "region_name": "region-name"} # zero value in config
# Call get_file_handle() which set timeout with Config object
s3.get_file_handle(config, "test")
# Verify Config is called with expected timeout
mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300)
mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300, region_name="region-name")

# Call list_files_in_bucket() which set timeout with Config object
file_handles = list(s3.list_files_in_bucket(config, "test"))
# Verify Config is called with expected timeout
mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300)
mocked_boto_config.assert_called_with(connect_timeout=300, read_timeout=300, region_name="region-name")

# Mock objects for boto resource
class MockObjectConnect:
Expand All @@ -124,7 +124,7 @@ def get():
class MockBucketConnect:
def Object(self):
return MockObjectConnect

class MockResourceConnect:
def Bucket(self):
return MockBucketConnect
Expand All @@ -145,7 +145,7 @@ def test_connect_timeout_on_get_file_handle(self, mocked_boto_config, mocked_res
s3.get_file_handle(config, "test")
except ConnectTimeoutError as e:
pass

# Verify that the resource and Config objects are each called 5 times
self.assertEquals(mocked_resource.call_count, 5)
self.assertEquals(mocked_boto_config.call_count, 5)
Expand All @@ -157,14 +157,14 @@ def test_connect_timeout_on_make_request(self, mocked_sleep):
# Mock PageIterator.method to raise ConnectTimeoutError error
mocked_method = mock.Mock()
mocked_method.side_effect = ConnectTimeoutError(endpoint_url="test")

try:
# Initialize PageIterator object and call _make_request function
paginator = PageIterator(mocked_method, "", "", "", "", "", "", "", "", "", "")
response = paginator._make_request({})
except ConnectTimeoutError as e:
pass

# Verify that PageIterator.method called 5 times
self.assertEquals(mocked_method.call_count, 5)

Expand All @@ -177,14 +177,14 @@ def get():
class MockBucketRead:
def Object(self):
return MockObjectRead

class MockResourceRead:
def Bucket(self):
return MockBucketRead

@mock.patch("time.sleep")
class TestReadTimeoutErrorBackoff(unittest.TestCase):

@mock.patch("boto3.resource")
@mock.patch("tap_s3_csv.s3.Config")
def test_read_timeout_on_get_file_handle(self, mocked_boto_config, mocked_resource, mocked_sleep):
Expand All @@ -210,13 +210,13 @@ def test_read_timeout_on_make_request(self, mocked_sleep):
# Mock PageIterator.method to raise ReadTimeoutError error
mocked_method = mock.Mock()
mocked_method.side_effect = ReadTimeoutError(endpoint_url="test")

try:
# Initialize PageIterator object and call _make_request function
paginator = PageIterator(mocked_method, "", "", "", "", "", "", "", "", "", "")
response = paginator._make_request({})
except ReadTimeoutError as e:
pass

# Verify that PageIterator.method called 5 times
self.assertEquals(mocked_method.call_count, 5)