Skip to content

Commit 99f5e29

Browse files
authored
Merge pull request #127 from awslabs/spectrum
Add support to Redshift Spectrum on Glue metadata
2 parents 74896f0 + 3998cca commit 99f5e29

File tree

9 files changed

+220
-73
lines changed

9 files changed

+220
-73
lines changed

awswrangler/glue.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,7 @@ def csv_partition_definition(partition, compression, extra_args=None):
379379
return {
380380
"StorageDescriptor": {
381381
"InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
382+
"OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
382383
"Location": partition[0],
383384
"Compressed": compressed,
384385
"SerdeInfo": {
@@ -440,7 +441,8 @@ def parquet_partition_definition(partition, compression):
440441
compressed = False if compression is None else True
441442
return {
442443
"StorageDescriptor": {
443-
"InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
444+
"InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
445+
"OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
444446
"Location": partition[0],
445447
"Compressed": compressed,
446448
"SerdeInfo": {

awswrangler/pandas.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def read_csv(self, path: str, max_result_size: Optional[int] = None, **pd_additi
6363
6464
:param path: Amazon S3 path (e.g. s3://bucket_name/key_name)
6565
:param max_result_size: Max number of bytes on each request to S3. It offers functionality similar to chunksize in pandas.read_csv(), but with higher performance
66-
:param **pd_additional_kwargs: Additional parameters forwarded to pandas.read_csv
66+
:param pd_additional_kwargs: Additional parameters forwarded to pandas.read_csv
6767
:return: Pandas Dataframe or Iterator of Pandas Dataframes if max_result_size != None
6868
"""
6969

@@ -1583,7 +1583,7 @@ def read_csv_list(
15831583
:param paths: List of Amazon S3 paths (e.g. ['s3://bucket_name/key_name1', 's3://bucket_name/key_name2'])
15841584
:param max_result_size: Max number of bytes on each request to S3. It offers functionality similar to chunksize in pandas.read_csv(), but with higher performance
15851585
:param procs_cpu_bound: Number of cores used for CPU bound tasks
1586-
:param **pd_additional_kwargs: Additional parameters forwarded to pandas.read_csv
1586+
:param pd_additional_kwargs: Additional parameters forwarded to pandas.read_csv
15871587
:return: Pandas Dataframe or Iterator of Pandas Dataframes if max_result_size != None
15881588
"""
15891589
if max_result_size is not None:
@@ -1636,7 +1636,7 @@ def _read_csv_list_iterator(self, paths: List[str], max_result_size=None, **pd_a
16361636
16371637
:param paths: List of Amazon S3 paths (e.g. ['s3://bucket_name/key_name1', 's3://bucket_name/key_name2'])
16381638
:param max_result_size: Max number of bytes on each request to S3. It offers functionality similar to chunksize in pandas.read_csv(), but with higher performance
1639-
:param **pd_additional_kwargs: Additional parameters forwarded to pandas.read_csv
1639+
:param pd_additional_kwargs: Additional parameters forwarded to pandas.read_csv
16401640
:return: Iterator of iterators of Pandas Dataframes
16411641
"""
16421642
for path in paths:

awswrangler/redshift.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def _validate_connection(database,
4444
password,
4545
tcp_keepalive=True,
4646
application_name="aws-data-wrangler-validation",
47-
validation_timeout=5):
47+
validation_timeout=10):
4848
conn = pg8000.connect(database=database,
4949
host=host,
5050
port=int(port),
@@ -66,7 +66,7 @@ def generate_connection(database,
6666
application_name="aws-data-wrangler",
6767
connection_timeout=1_200_000,
6868
statement_timeout=1_200_000,
69-
validation_timeout=5):
69+
validation_timeout=10):
7070
"""
7171
Generates a valid connection object to be passed to the load_table method
7272

testing/deploy-cloudformation.sh

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@
22
set -e
33

44
aws cloudformation deploy \
5-
--template-file template.yaml \
6-
--stack-name aws-data-wrangler-test \
7-
--capabilities CAPABILITY_IAM \
8-
--parameter-overrides $(cat parameters.properties)
5+
--template-file template.yaml \
6+
--stack-name aws-data-wrangler-test \
7+
--capabilities CAPABILITY_IAM \
8+
--parameter-overrides $(cat parameters.properties)
9+
10+
aws cloudformation update-termination-protection \
11+
--enable-termination-protection \
12+
--stack-name aws-data-wrangler-test

testing/parameters.properties

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@ VpcId=VPC_ID
22
SubnetId=SUBNET_ID
33
SubnetId2=SUBNET_ID2
44
SubnetAz=AVAILABILITY_ZONE
5-
Password=REDSHIFT_PASSWORD
6-
TestUser=AWS_USER_THAT_WILL_RUN_THE_TESTS_ON_CLI
5+
DatabasesPassword=REDSHIFT_PASSWORD
6+
AWSUserForTests=AWS_USER_THAT_WILL_RUN_THE_TESTS_ON_CLI

testing/template.yaml

Lines changed: 53 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@ Parameters:
1616
SubnetAz:
1717
Type: String
1818
Description: Subnet AZ
19-
Password:
19+
DatabasesPassword:
2020
Type: String
21-
Description: Redshift Password
22-
TestUser:
21+
Description: Password for all databases
22+
AWSUserForTests:
2323
Type: String
2424
Description: AWS User that will run the tests on the CLI
2525

@@ -49,7 +49,7 @@ Resources:
4949
- Sid: "Allow administration of the key"
5050
Effect: "Allow"
5151
Principal:
52-
AWS: !Join ["", ["arn:aws:iam::", !Ref "AWS::AccountId", ":user/", !Ref TestUser]]
52+
AWS: !Join ["", ["arn:aws:iam::", !Ref "AWS::AccountId", ":user/", !Ref AWSUserForTests]]
5353
Action:
5454
- "kms:Create*"
5555
- "kms:Describe*"
@@ -95,7 +95,7 @@ Resources:
9595
- sts:AssumeRole
9696
Path: "/"
9797
Policies:
98-
- PolicyName: S3GetAndList
98+
- PolicyName: Root
9999
PolicyDocument:
100100
Version: 2012-10-17
101101
Statement:
@@ -107,6 +107,30 @@ Resources:
107107
Resource:
108108
- !Join ['', ['arn:aws:s3:::', !Ref Bucket]]
109109
- !Join ['', ['arn:aws:s3:::', !Ref Bucket, /*]]
110+
- Effect: Allow
111+
Action:
112+
- "lakeformation:GrantPermissions"
113+
Resource: "*"
114+
- Effect: Allow
115+
Action:
116+
- "glue:SearchTables"
117+
- "glue:GetConnections"
118+
- "glue:GetDataCatalogEncryptionSettings"
119+
- "glue:GetTables"
120+
- "glue:GetTableVersions"
121+
- "glue:GetPartitions"
122+
- "glue:DeleteTableVersion"
123+
- "glue:BatchGetPartition"
124+
- "glue:GetDatabases"
125+
- "glue:GetTags"
126+
- "glue:GetTable"
127+
- "glue:GetDatabase"
128+
- "glue:GetPartition"
129+
- "glue:GetTableVersion"
130+
- "glue:GetConnection"
131+
- "glue:GetUserDefinedFunction"
132+
- "glue:GetUserDefinedFunctions"
133+
Resource: "*"
110134

111135
RedshiftSubnetGroup:
112136
Type: AWS::Redshift::ClusterSubnetGroup
@@ -140,7 +164,7 @@ Resources:
140164
Properties:
141165
DBName: test
142166
MasterUsername: test
143-
MasterUserPassword: !Ref Password
167+
MasterUserPassword: !Ref DatabasesPassword
144168
NodeType: dc2.large
145169
ClusterType: single-node
146170
VpcSecurityGroupIds:
@@ -223,7 +247,7 @@ Resources:
223247
Engine: aurora-postgresql
224248
DBClusterIdentifier : postgres-cluster-wrangler
225249
MasterUsername: test
226-
MasterUserPassword: !Ref Password
250+
MasterUserPassword: !Ref DatabasesPassword
227251
BackupRetentionPeriod: 1
228252
DBSubnetGroupName: !Ref RdsSubnetGroup
229253
VpcSecurityGroupIds:
@@ -264,19 +288,21 @@ Resources:
264288
Engine: aurora-mysql
265289
DBClusterIdentifier: mysql-cluster-wrangler
266290
MasterUsername: test
267-
MasterUserPassword: !Ref Password
291+
MasterUserPassword: !Ref DatabasesPassword
268292
BackupRetentionPeriod: 1
269293
DBSubnetGroupName: !Ref RdsSubnetGroup
270294
VpcSecurityGroupIds:
271295
- !Ref DatabaseSecurityGroup
272296
DBClusterParameterGroupName: !Ref MysqlParameterGroup
297+
DatabaseName: test
273298
AssociatedRoles:
274299
- RoleArn: !GetAtt AuroraRole.Arn
275300

276301
AuroraInstanceMysql:
277302
Type: AWS::RDS::DBInstance
278303
Properties:
279304
Engine: aurora-mysql
305+
# DBName: test
280306
DBInstanceIdentifier: mysql-instance-wrangler
281307
DBClusterIdentifier: !Ref AuroraClusterMysql
282308
DBInstanceClass: db.t3.medium
@@ -285,6 +311,9 @@ Resources:
285311

286312
RedshiftGlueConnection:
287313
Type: AWS::Glue::Connection
314+
DependsOn:
315+
- DatabaseSecurityGroup
316+
- Redshift
288317
Properties:
289318
CatalogId: !Ref AWS::AccountId
290319
ConnectionInput:
@@ -310,12 +339,15 @@ Resources:
310339
],
311340
],
312341
"USERNAME": test,
313-
"PASSWORD": !Ref Password,
342+
"PASSWORD": !Ref DatabasesPassword,
314343
}
315344
Name: "aws-data-wrangler-redshift"
316345

317346
PostgresGlueConnection:
318347
Type: AWS::Glue::Connection
348+
DependsOn:
349+
- DatabaseSecurityGroup
350+
- AuroraInstancePostgres
319351
Properties:
320352
CatalogId: !Ref AWS::AccountId
321353
ConnectionInput:
@@ -341,12 +373,15 @@ Resources:
341373
],
342374
],
343375
"USERNAME": test,
344-
"PASSWORD": !Ref Password,
376+
"PASSWORD": !Ref DatabasesPassword,
345377
}
346378
Name: "aws-data-wrangler-postgres"
347379

348380
MysqlGlueConnection:
349381
Type: AWS::Glue::Connection
382+
DependsOn:
383+
- DatabaseSecurityGroup
384+
- AuroraInstanceMysql
350385
Properties:
351386
CatalogId: !Ref AWS::AccountId
352387
ConnectionInput:
@@ -372,7 +407,7 @@ Resources:
372407
],
373408
],
374409
"USERNAME": test,
375-
"PASSWORD": !Ref Password,
410+
"PASSWORD": !Ref DatabasesPassword,
376411
}
377412
Name: "aws-data-wrangler-mysql"
378413

@@ -398,12 +433,12 @@ Outputs:
398433
Description: Name of the S3 Bucket used for tests.
399434
RedshiftAddress:
400435
Value: !GetAtt Redshift.Endpoint.Address
401-
Description: Redshift Password.
436+
Description: Redshift address.
402437
RedshiftPort:
403438
Value: !GetAtt Redshift.Endpoint.Port
404439
Description: Redshift Endpoint Port.
405-
Password:
406-
Value: !Ref Password
440+
DatabasesPassword:
441+
Value: !Ref DatabasesPassword
407442
Description: Password.
408443
RedshiftRole:
409444
Value: !GetAtt RedshiftRole.Arn
@@ -434,4 +469,7 @@ Outputs:
434469
Description: Mysql Address
435470
DynamoDbTableARN:
436471
Value: !GetAtt DynamoDBTable.Arn
437-
Description: DynamoDB table name
472+
Description: DynamoDB table ARN
473+
Region:
474+
Value: !Ref AWS::Region
475+
Description: AWS Region

testing/test_awswrangler/test_aurora.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ def postgres_parameters(cloudformation_outputs):
2626
postgres_parameters["PostgresAddress"] = cloudformation_outputs.get("PostgresAddress")
2727
else:
2828
raise Exception("You must deploy the test infrastructure using SAM!")
29-
if "Password" in cloudformation_outputs:
30-
postgres_parameters["Password"] = cloudformation_outputs.get("Password")
29+
if "DatabasesPassword" in cloudformation_outputs:
30+
postgres_parameters["DatabasesPassword"] = cloudformation_outputs.get("DatabasesPassword")
3131
else:
3232
raise Exception("You must deploy the test infrastructure using SAM!")
3333
yield postgres_parameters
@@ -40,8 +40,8 @@ def mysql_parameters(cloudformation_outputs):
4040
mysql_parameters["MysqlAddress"] = cloudformation_outputs.get("MysqlAddress")
4141
else:
4242
raise Exception("You must deploy the test infrastructure using SAM!")
43-
if "Password" in cloudformation_outputs:
44-
mysql_parameters["Password"] = cloudformation_outputs.get("Password")
43+
if "DatabasesPassword" in cloudformation_outputs:
44+
mysql_parameters["DatabasesPassword"] = cloudformation_outputs.get("DatabasesPassword")
4545
else:
4646
raise Exception("You must deploy the test infrastructure using SAM!")
4747
yield mysql_parameters
@@ -52,7 +52,7 @@ def test_postgres_connection(postgres_parameters):
5252
host=postgres_parameters["PostgresAddress"],
5353
port=3306,
5454
user="test",
55-
password=postgres_parameters["Password"],
55+
password=postgres_parameters["DatabasesPassword"],
5656
engine="postgres")
5757
cursor = conn.cursor()
5858
cursor.execute("SELECT 1 + 2, 3 + 4")
@@ -68,7 +68,7 @@ def test_mysql_connection(mysql_parameters):
6868
host=mysql_parameters["MysqlAddress"],
6969
port=3306,
7070
user="test",
71-
password=mysql_parameters["Password"],
71+
password=mysql_parameters["DatabasesPassword"],
7272
engine="mysql")
7373
cursor = conn.cursor()
7474
cursor.execute("SELECT 1 + 2, 3 + 4")
@@ -85,5 +85,5 @@ def test_invalid_engine(mysql_parameters):
8585
host=mysql_parameters["MysqlAddress"],
8686
port=3306,
8787
user="test",
88-
password=mysql_parameters["Password"],
88+
password=mysql_parameters["DatabasesPassword"],
8989
engine="foo")

0 commit comments

Comments
 (0)