Skip to content

Commit 5243dab

Browse files
authored
(enhancement): Enable missing unit tests and Redshift, Athena, LF load tests (#1736)
* (enhancement): Enable missing unit tests and Redshift, Athena, LF load tests
1 parent ebd283d commit 5243dab

19 files changed

+223
-215
lines changed

CONTRIBUTING.md

Lines changed: 0 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -245,113 +245,6 @@ You can choose from three different environments to test your fixes/changes, bas
245245

246246
``./test_infra/scripts/delete-stack.sh databases``
247247

248-
## Ray Load Tests Environment
249-
**DISCLAIMER**: Make sure you know what you are doing. These steps will charge some services on your AWS account and require a minimum security skill to keep your environment safe.
250-
251-
* Use a Linux or macOS machine.
252-
* Install Python 3.7, 3.8 or 3.9 with [poetry](https://github.com/python-poetry/poetry) for package management
253-
* Fork the AWS SDK for pandas repository and clone that into your development environment
254-
255-
* Then run the command below to install all dependencies:
256-
257-
``poetry install``
258-
259-
* Go to the ``test_infra`` directory
260-
261-
``cd test_infra``
262-
263-
* Install CDK dependencies:
264-
265-
``poetry install``
266-
267-
* [OPTIONAL] Set AWS_DEFAULT_REGION to define the region the Ray Test environment will deploy into. You may want to choose a region which you don't currently use:
268-
269-
``export AWS_DEFAULT_REGION=ap-northeast-1``
270-
271-
* Go to the ``scripts`` directory
272-
273-
``cd scripts``
274-
275-
* Deploy the `ray` CDK stack.
276-
277-
``./deploy-stack.sh ray``
278-
279-
* Configure Ray Cluster
280-
281-
``vi ray-cluster-config.yaml``
282-
283-
```
284-
# Update the following file to match your environment
285-
# The following is an example
286-
cluster_name: ray-cluster
287-
288-
min_workers: 2
289-
max_workers: 2
290-
291-
provider:
292-
type: aws
293-
region: us-east-1 # change region as required
294-
availability_zone: us-east-1a,us-east-1b,us-east-1c # change azs as required
295-
security_group:
296-
GroupName: ray_client_security_group
297-
cache_stopped_nodes: False
298-
299-
available_node_types:
300-
ray.head.default:
301-
node_config:
302-
InstanceType: r5n.2xlarge # change instance type as required
303-
IamInstanceProfile:
304-
Arn: arn:aws:iam::{UPDATE YOUR ACCOUNT ID HERE}:instance-profile/ray-cluster-instance-profile
305-
ImageId: ami-0ea510fcb67686b48 # latest ray images -> https://github.com/amzn/amazon-ray#amazon-ray-images
306-
SubnetId: {replace with subnet within above AZs}
307-
308-
ray.worker.default:
309-
min_workers: 2
310-
max_workers: 2
311-
node_config:
312-
InstanceType: r5n.2xlarge
313-
IamInstanceProfile:
314-
Arn: arn:aws:iam::{UPDATE YOUR ACCOUNT ID HERE}:instance-profile/ray-cluster-instance-profile
315-
ImageId: ami-0ea510fcb67686b48 # latest ray images -> https://github.com/amzn/amazon-ray#amazon-ray-images
316-
SubnetId: {replace with subnet within above AZs}
317-
318-
setup_commands:
319-
- pip install "awswrangler[modin, ray]==3.0.0rc1"
320-
- pip install pytest
321-
322-
```
323-
324-
* Create Ray Cluster
325-
``ray up -y ray-cluster-config.yaml``
326-
327-
* Push Load Tests to Ray Cluster
328-
``ray rsync-up ray-cluster-config.yaml tests/load /home/ubuntu/``
329-
330-
* Submit Pytest Run to Ray Cluster
331-
```
332-
echo '''
333-
import os
334-
335-
import pytest
336-
337-
args = "-v load/"
338-
339-
if not os.getenv("AWS_DEFAULT_REGION"):
340-
os.environ["AWS_DEFAULT_REGION"] = "us-east-1" # Set your region as necessary
341-
342-
result = pytest.main(args.split(" "))
343-
344-
print(f"result: {result}")
345-
''' > handler.py
346-
ray submit ray-cluster-config.yaml handler.py
347-
```
348-
349-
* Teardown Cluster
350-
``ray down -y ray-cluster-config.yaml``
351-
352-
[More on launching Ray Clusters on AWS](https://docs.ray.io/en/master/cluster/vms/user-guides/launching-clusters/aws.html#)
353-
354-
355248
## Recommended Visual Studio Code settings
356249

357250
```json

awswrangler/lakeformation/_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def _build_table_objects(
4444
partitions_values: Dict[str, List[str]],
4545
use_threads: Union[bool, int],
4646
boto3_session: Optional[boto3.Session],
47-
) -> List[Dict[str, Any]]:
47+
) -> List[List[Dict[str, Any]]]:
4848
table_objects: List[Dict[str, Any]] = []
4949
paths_desc: Dict[str, Dict[str, Any]] = describe_objects(
5050
path=paths, use_threads=use_threads, boto3_session=boto3_session
@@ -58,7 +58,7 @@ def _build_table_objects(
5858
if partitions_values:
5959
table_object["PartitionValues"] = partitions_values[f"{path.rsplit('/', 1)[0].rstrip('/')}/"]
6060
table_objects.append(table_object)
61-
return table_objects
61+
return _utils.chunkify(table_objects, max_length=100) # LF write operations is limited to 100 objects per call
6262

6363

6464
def _get_table_objects(

awswrangler/redshift.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1582,7 +1582,6 @@ def copy( # pylint: disable=too-many-arguments,too-many-locals
15821582
e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}
15831583
max_rows_by_file : int
15841584
Max number of rows in each file.
1585-
Default is None i.e. dont split the files.
15861585
(e.g. 33554432, 268435456)
15871586
precombine_key : str, optional
15881587
When there is a primary_key match during upsert, this column will change the upsert method,

awswrangler/s3/_write_dataset.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -316,19 +316,22 @@ def _to_dataset(
316316
_logger.debug("paths: %s", paths)
317317
_logger.debug("partitions_values: %s", partitions_values)
318318
if (table_type == "GOVERNED") and (table is not None) and (database is not None):
319-
add_objects: List[Dict[str, Any]] = lakeformation._build_table_objects( # pylint: disable=protected-access
319+
list_add_objects: List[
320+
List[Dict[str, Any]]
321+
] = lakeformation._build_table_objects( # pylint: disable=protected-access
320322
paths, partitions_values, use_threads=use_threads, boto3_session=boto3_session
321323
)
322324
try:
323-
if add_objects:
324-
lakeformation._update_table_objects( # pylint: disable=protected-access
325-
catalog_id=catalog_id,
326-
database=database,
327-
table=table,
328-
transaction_id=transaction_id, # type: ignore
329-
add_objects=add_objects,
330-
boto3_session=boto3_session,
331-
)
325+
if list_add_objects:
326+
for add_objects in list_add_objects:
327+
lakeformation._update_table_objects( # pylint: disable=protected-access
328+
catalog_id=catalog_id,
329+
database=database,
330+
table=table,
331+
transaction_id=transaction_id, # type: ignore
332+
add_objects=add_objects,
333+
boto3_session=boto3_session,
334+
)
332335
except Exception as ex:
333336
_logger.error(ex)
334337
raise

tests/conftest.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,3 +349,10 @@ def random_glue_database():
349349
database_name = get_time_str_with_random_suffix()
350350
yield database_name
351351
wr.catalog.delete_database(database_name)
352+
353+
354+
@pytest.fixture(scope="function")
355+
def redshift_con():
356+
con = wr.redshift.connect("aws-sdk-pandas-redshift")
357+
yield con
358+
con.close()

tests/load/test_database.py

Lines changed: 0 additions & 67 deletions
This file was deleted.

0 commit comments

Comments
 (0)