Skip to content

Commit 8b49531

Browse files
committed
Add distribution and sorting configuration in Redshift integration.
1 parent 6066d19 commit 8b49531

File tree

9 files changed

+336
-24
lines changed

9 files changed

+336
-24
lines changed

awswrangler/exceptions.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,19 @@ class LineTerminatorNotFound(Exception):
3636

3737
class MissingBatchDetected(Exception):
3838
pass
39+
40+
41+
class InvalidRedshiftDiststyle(Exception):
42+
pass
43+
44+
45+
class InvalidRedshiftDistkey(Exception):
46+
pass
47+
48+
49+
class InvalidRedshiftSortstyle(Exception):
50+
pass
51+
52+
53+
class InvalidRedshiftSortkey(Exception):
54+
pass

awswrangler/pandas.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,10 @@ def to_redshift(
760760
schema,
761761
table,
762762
iam_role,
763+
diststyle="AUTO",
764+
distkey=None,
765+
sortstyle="COMPOUND",
766+
sortkey=None,
763767
preserve_index=False,
764768
mode="append",
765769
):
@@ -771,6 +775,12 @@ def to_redshift(
771775
:param schema: The Redshift Schema for the table
772776
:param table: The name of the desired Redshift table
773777
:param iam_role: AWS IAM role with the related permissions
778+
:param diststyle: Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"]
779+
https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
780+
:param distkey: Specifies a column name or positional number for the distribution key
781+
:param sortstyle: Sorting can be "COMPOUND" or "INTERLEAVED"
782+
https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html
783+
:param sortkey: List of columns to be sorted
774784
:param preserve_index: Should we preserve the Dataframe index?
775785
:param mode: append or overwrite
776786
:return: None
@@ -779,7 +789,7 @@ def to_redshift(
779789
path += "/"
780790
self._session.s3.delete_objects(path=path)
781791
num_rows = len(dataframe.index)
782-
logger.info(f"Number of rows: {num_rows}")
792+
logger.debug(f"Number of rows: {num_rows}")
783793
if num_rows < MIN_NUMBER_OF_ROWS_TO_DISTRIBUTE:
784794
num_partitions = 1
785795
else:
@@ -808,6 +818,10 @@ def to_redshift(
808818
preserve_index=False,
809819
num_files=num_partitions,
810820
iam_role=iam_role,
821+
diststyle=diststyle,
822+
distkey=distkey,
823+
sortstyle=sortstyle,
824+
sortkey=sortkey,
811825
mode=mode,
812826
)
813827
self._session.s3.delete_objects(path=path)

awswrangler/redshift.py

Lines changed: 154 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,26 @@
77
RedshiftLoadError,
88
UnsupportedType,
99
InvalidDataframeType,
10+
InvalidRedshiftDiststyle,
11+
InvalidRedshiftDistkey,
12+
InvalidRedshiftSortstyle,
13+
InvalidRedshiftSortkey,
1014
)
1115

1216
logger = logging.getLogger(__name__)
1317

18+
DISTSTYLES = [
19+
"AUTO",
20+
"EVEN",
21+
"ALL",
22+
"KEY",
23+
]
24+
25+
SORTSTYLES = [
26+
"COMPOUND",
27+
"INTERLEAVED",
28+
]
29+
1430

1531
class Redshift:
1632
def __init__(self, session):
@@ -87,24 +103,48 @@ def load_table(
87103
redshift_conn,
88104
num_files,
89105
iam_role,
106+
diststyle="AUTO",
107+
distkey=None,
108+
sortstyle="COMPOUND",
109+
sortkey=None,
90110
mode="append",
91111
preserve_index=False,
92112
):
113+
"""
114+
Load Parquet files into a Redshift table using a manifest file.
115+
Creates the table if necessary.
116+
:param dataframe: Pandas or Spark Dataframe
117+
:param dataframe_type: "pandas" or "spark"
118+
:param manifest_path: S3 path for manifest file (E.g. S3://...)
119+
:param schema_name: Redshift schema
120+
:param table_name: Redshift table name
121+
:param redshift_conn: A PEP 249 compatible connection (Can be generated with Redshift.generate_connection())
122+
:param num_files: Number of files to be loaded
123+
:param iam_role: AWS IAM role with the related permissions
124+
:param diststyle: Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"]
125+
https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
126+
:param distkey: Specifies a column name or positional number for the distribution key
127+
:param sortstyle: Sorting can be "COMPOUND" or "INTERLEAVED"
128+
https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html
129+
:param sortkey: List of columns to be sorted
130+
:param mode: append or overwrite
131+
:param preserve_index: Should we preserve the Dataframe index? (ONLY for Pandas Dataframe)
132+
:return: None
133+
"""
93134
cursor = redshift_conn.cursor()
94135
if mode == "overwrite":
95-
cursor.execute("-- AWS DATA WRANGLER\n"
96-
f"DROP TABLE IF EXISTS {schema_name}.{table_name}")
97-
schema = Redshift._get_redshift_schema(
136+
Redshift._create_table(
137+
cursor=cursor,
98138
dataframe=dataframe,
99139
dataframe_type=dataframe_type,
140+
schema_name=schema_name,
141+
table_name=table_name,
142+
diststyle=diststyle,
143+
distkey=distkey,
144+
sortstyle=sortstyle,
145+
sortkey=sortkey,
100146
preserve_index=preserve_index,
101147
)
102-
cols_str = "".join([f"{col[0]} {col[1]},\n" for col in schema])[:-2]
103-
sql = (
104-
"-- AWS DATA WRANGLER\n"
105-
f"CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} (\n{cols_str}"
106-
") DISTSTYLE AUTO")
107-
cursor.execute(sql)
108148
sql = ("-- AWS DATA WRANGLER\n"
109149
f"COPY {schema_name}.{table_name} FROM '{manifest_path}'\n"
110150
f"IAM_ROLE '{iam_role}'\n"
@@ -129,6 +169,111 @@ def load_table(
129169
redshift_conn.commit()
130170
cursor.close()
131171

172+
@staticmethod
def _create_table(
        cursor,
        dataframe,
        dataframe_type,
        schema_name,
        table_name,
        diststyle="AUTO",
        distkey=None,
        sortstyle="COMPOUND",
        sortkey=None,
        preserve_index=False,
):
    """
    Drop the Redshift table (if it exists) and create it again from the Dataframe schema.

    The column list is built and every distribution/sorting option is
    validated BEFORE any statement is sent to Redshift, so invalid
    arguments raise without the DROP TABLE having been executed.

    :param cursor: A PEP 249 compatible cursor
    :param dataframe: Pandas or Spark Dataframe
    :param dataframe_type: "pandas" or "spark"
    :param schema_name: Redshift schema
    :param table_name: Redshift table name
    :param diststyle: Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"]
            (case insensitive, a falsy value falls back to "AUTO")
            https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
    :param distkey: Name of the column used as distribution key.
            Must be one of the table columns; only emitted when diststyle is "KEY"
    :param sortstyle: Sorting can be "COMPOUND" or "INTERLEAVED"
            (case insensitive, a falsy value falls back to "COMPOUND")
            https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html
    :param sortkey: List of column names to be sorted
    :param preserve_index: Should we preserve the Dataframe index? (ONLY for Pandas Dataframe)
    :return: None
    :raises InvalidRedshiftDiststyle, InvalidRedshiftDistkey, InvalidRedshiftSortstyle,
            InvalidRedshiftSortkey: propagated from Redshift._validate_parameters()
    """
    schema = Redshift._get_redshift_schema(
        dataframe=dataframe,
        dataframe_type=dataframe_type,
        preserve_index=preserve_index,
    )
    # Normalise the styles so callers may pass lower case or None.
    diststyle = diststyle.upper() if diststyle else "AUTO"
    sortstyle = sortstyle.upper() if sortstyle else "COMPOUND"
    # Fail fast: nothing destructive has been executed yet at this point.
    Redshift._validate_parameters(schema=schema,
                                  diststyle=diststyle,
                                  distkey=distkey,
                                  sortstyle=sortstyle,
                                  sortkey=sortkey)

    sql = ("-- AWS DATA WRANGLER\n"
           f"DROP TABLE IF EXISTS {schema_name}.{table_name}")
    logger.debug("Drop table query:\n%s", sql)
    cursor.execute(sql)

    # One "<name> <type>" entry per column, separated by ",\n".
    cols_str = ",\n".join(f"{col[0]} {col[1]}" for col in schema)
    distkey_str = ""
    if distkey and diststyle == "KEY":
        # A distkey passed together with another diststyle is ignored.
        distkey_str = f"\nDISTKEY({distkey})"
    sortkey_str = ""
    if sortkey:
        sortkey_str = f"\n{sortstyle} SORTKEY({','.join(sortkey)})"
    sql = ("-- AWS DATA WRANGLER\n"
           f"CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} (\n"
           f"{cols_str}"
           f")\nDISTSTYLE {diststyle}"
           f"{distkey_str}"
           f"{sortkey_str}")
    logger.debug("Create table query:\n%s", sql)
    cursor.execute(sql)
238+
239+
@staticmethod
def _validate_parameters(schema, diststyle, distkey, sortstyle, sortkey):
    """
    Validate the distribution and sorting options against the table columns.

    Styles are compared as given against DISTSTYLES / SORTSTYLES, so the
    caller is expected to pass them already upper-cased.

    :param schema: List of tuples (column name, column type)
    :param diststyle: Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"]
            https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
    :param distkey: Name of the column used as distribution key.
            Mandatory when diststyle is "KEY"; when given it must be a column of the schema
    :param sortstyle: Sorting can be "COMPOUND" or "INTERLEAVED" (a falsy value skips this check)
            https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html
    :param sortkey: List of column names to be sorted (a falsy value skips this check)
    :return: None
    :raises InvalidRedshiftDiststyle: diststyle is not one of DISTSTYLES
    :raises InvalidRedshiftDistkey: distkey is missing for the KEY diststyle or is not a column
    :raises InvalidRedshiftSortstyle: sortstyle is not one of SORTSTYLES
    :raises InvalidRedshiftSortkey: sortkey is not a list or holds a name that is not a column
    """
    if diststyle not in DISTSTYLES:
        raise InvalidRedshiftDiststyle(
            f"diststyle must be in {DISTSTYLES}")
    cols = [x[0] for x in schema]
    logger.debug("Redshift columns: %s", cols)
    if diststyle == "KEY" and not distkey:
        raise InvalidRedshiftDistkey(
            "You must pass a distkey if you intend to use KEY diststyle")
    if distkey and distkey not in cols:
        raise InvalidRedshiftDistkey(
            f"distkey ({distkey}) must be in the columns list: {cols}")
    if sortstyle and sortstyle not in SORTSTYLES:
        raise InvalidRedshiftSortstyle(
            f"sortstyle must be in {SORTSTYLES}")
    if sortkey:
        # isinstance() also accepts list subclasses; a plain string is still
        # rejected, otherwise it would be iterated character by character.
        if not isinstance(sortkey, list):
            raise InvalidRedshiftSortkey(
                f"sortkey must be a List of items in the columns list: {cols}. "
                f"Currently value: {sortkey}")
        for key in sortkey:
            if key not in cols:
                raise InvalidRedshiftSortkey(
                    f"sortkey must be a List of items in the columns list: {cols}. "
                    f"Currently value: {key}")
276+
132277
@staticmethod
133278
def _get_redshift_schema(dataframe, dataframe_type, preserve_index=False):
134279
schema_built = []

awswrangler/spark.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,31 @@ def to_redshift(
2828
schema,
2929
table,
3030
iam_role,
31+
diststyle="AUTO",
32+
distkey=None,
33+
sortstyle="COMPOUND",
34+
sortkey=None,
3135
min_num_partitions=200,
3236
mode="append",
3337
):
38+
"""
39+
Load Spark Dataframe as a Table on Amazon Redshift
40+
:param dataframe: Spark Dataframe
41+
:param path: S3 path to write temporary files (E.g. s3://BUCKET_NAME/ANY_NAME/)
42+
:param connection: A PEP 249 compatible connection (Can be generated with Redshift.generate_connection())
43+
:param schema: The Redshift Schema for the table
44+
:param table: The name of the desired Redshift table
45+
:param iam_role: AWS IAM role with the related permissions
46+
:param diststyle: Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"]
47+
https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
48+
:param distkey: Specifies a column name or positional number for the distribution key
49+
:param sortstyle: Sorting can be "COMPOUND" or "INTERLEAVED"
50+
https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html
51+
:param sortkey: List of columns to be sorted
52+
:param min_num_partitions: Minimal number of partitions
53+
:param mode: append or overwrite
54+
:return: None
55+
"""
3456
logger.debug(f"Minimum number of partitions : {min_num_partitions}")
3557
self._session.s3.delete_objects(path=path)
3658
num_slices = self._session.redshift.get_number_of_slices(
@@ -90,6 +112,10 @@ def write(pandas_dataframe):
90112
preserve_index=False,
91113
num_files=num_partitions,
92114
iam_role=iam_role,
115+
diststyle=diststyle,
116+
distkey=distkey,
117+
sortstyle=sortstyle,
118+
sortkey=sortkey,
93119
mode=mode,
94120
)
95121
dataframe.unpersist()
109 KB
Loading
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<mxfile modified="2019-08-12T13:40:33.978Z" host="www.draw.io" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36" etag="Og3SVW1iHjwL1TZe0QbU" version="11.1.4" type="google"><diagram id="uhJXyVCDerINJ9iByMH9" name="Page-1">7V3Rcpu6Fv2azNzz4AwIEPgxiZMmc9smJ06n7VNGBtmmxeCC3Nj9+isJBAhhh9QG0/rmITFCCKG9lvbeS8I5M64W63cxWs4/RB4OzoDmrc+M0RkAuq4b9A8r2aQlNgRpwSz2vaxSUTD2f+GsUMtKV76HE6kiiaKA+Eu50I3CELtEKkNxHL3I1aZRIN91iWZYKRi7KFBLP/semaelDrCL8lvsz+bizjocpmcWSFTOniSZIy96KRUZ12fGVRxFJP20WF/hgA2eGJf0upstZ/OOxTgkTS4Ibx/0FdLvH759Ag/Jx9vRlDwP9MwaCdmIJ8YeHYDsMIxC+ucyjlahh1k7Gj2KYjKPZlGIgvdRtKSFOi38hgnZZOZDKxLRojlZBNlZ2sV484Vdf26Jw69Zc/xgtJaONtlRQlBMLpgZi87wshs/CLI66jhkQ5NEq9jFOx7ezPCE4hkmuwYpa5CNTOkO2TC/w9EC007TCjEOEPF/ytBBGQJneb3CSPRDZqc32Czr9k8UrLI7vaPdB1q4WkxwTD9EU/b0ge9S4gAY0Ee7nNByOGOf/JCefMReMven7KqrYJUQdlkFBrLNX+Y+weMl4uP5Qqku23erDX7imOD1zkHLzkLAsMEvyqYKw3LS45eCeLpg07xEOqi1NNJ2J+RoF+RWQ5DDXmHcUjA+wgEmmPWQYmpD5n44Y42ECfUP7PnmmHfOxbRvngp7ghfLKEb8EcYGH0uKqWODHlgy5IFuK5AHdg3kDbOlgYfmCTsE2JArTq+4AhWuXFHzkJwUsl/gIREjjjhfZcoDCj3EqowQQdMYLfDxaSLgLmjiqDQxhJcu0wQMWxpzEZ6WBj0fSTGQomAacWMU4wd/rCJxYpBwilzQCrq1XBcnRSsfI9EO7WjalNw8LS7dsgvurn3ypfT5a0FjelQQlx10w9umfNzIuUVjemYtPUQ+t6SIWPRhNWJxHLmRdCbJrivH5qIpUTGaThNMFFjmfd8DqU6noNDeAIp+OgGnoRPQrUN7AX4pfS60KVVYMqQk21Gob0VhAaS0zYPCCnQ2AX4tspjfmQGpiyEyVGNMb4kmvAKDSjbCtLZ1eWaNaAkK/FlIC1wKHuo5jUvmqnwXBRfZiYXveez6ywBNcHCJ3O8zzqCrKIhifl9jyn9qkbiTp4pXzLWNrMtnZfmgzlsOtHNzaNoHmeMGQIJWrhe0P285W/G1jPEWhE2RKwPsFgc/MbNcGVYNgAgYENNpRJvk1h24qXlZlXg2+Q+P3qkt6ANqlc//1DjymjCsGnvRv2jB4qeg/HTF+Q93H58/fvpwef34fH/z/Hj/efz8dP88uhs/Pd5dfnq6rm0xL7mpK1SotIVh0qhX48F5tJiskm5iQXW+M1SdwDBrkiZda0soMLpRCvrpMIcNHabRLxltqEwwn2OfKwxsgID2gOIfKy6sTf2AFSw5e+kIEZ/4EcufaKZEf894JTZN5CkXVxi+441gO0buPGvofDdHsxzsfEzraZ9ZWy5itwojko4Mb5OVBH5CBmhKHdTgJe84sxe7N0WO0jE/STuVnB8/oXM0icKmVaN71El9TlsMFusUf7TWJ1LgV6k47BUTRbdLVJSd5YPgXJ3L5H68UihfPs6k8N4p3CZUYd+twq137bgkt1V4saM4Lr2pNt4zuqji+J500RUQdM0NqxrUHZ8aAHZLDb1P1ACgITX
6IYJAR5fQAwxQsX8bEghQeHjheQlPA4vk70ZKPQvZXcMseKwykfM3XZpK48fe+SxdM45NTKsbYgpFM1cxv5bO/EmKJjCOReb97KxmaP+XGPeRGAVxDiIxZpjaV19U1BwBwvYVRnDKsW/jOUF4ub5MCtt14V4tvP4xc4J9wDlBt6AcKxxkjqhGIHkC1MEUYWxFW7LkupwKLh6UDZI0KmPwCqN4gQIVYKOIuR1ty86kamQ453sy0FuiS+OmBNW0v30Q8IfVKd+u2efn1On3emv6fUeL5L0MD42mqqFh9MoVGKpsmG6ETRnA5/dME+e8YrlVlVWI34bp5wSzikyhP75SaIAqRayme2GNtnY8mapIdEIUaRotGf1KoQzVf4lFrpQkCxT6U5yQ828JJ0rBlhefixBVgkSTbzQm4K6IL37RZvxYPA/beMt5d3wKAcesUAiKsKG8xqR3SSFLU4blhCjUVGw3+6W2G6rafioUghUC2VAlkFVDILOtMM3sSPnrZcZuNpXkTbNXBDLV/ektZez3FNvZVoi/P3E3DynmmaYQW/fM1YGlJOtAbqO9ZN3qTBq6WC5Lqze92ZffS7/feNqymgqNJQ16qNky2qw9t/a3j1IxHuXXdea4tHfsw/3oWvX9HQo0BpA3Z9lOzYpf3f5Ks7Xc85TlGeHOX/f7dr/8vvqe8ujx/oGWPF1cvqcQ1+5u6K/rL3fjp/HRg12oVUA/zD1ZGfaw7iWztt7FtDpeoOrVDhSB5sP5jY5gLxZUyq9jhgyIeSqYv2HpIcIEfbJZpisBbNNu6dX8yuk0W3RjjErZZxrlZVlnce2R2WTB6qKNY5k1fAJ1bsRoK320TlnCFM75dTfSMz6pu0DGbFDEahiJUZggN9uR3y8aVBUU22iooOhtRVLwlCkgPMXrLqVfEqTod4kC7yPk5T4gW7TqHfirCrwjtgQeD/7glOF/cFTXa0A0K5fMPrQqAfKWt+IPpgOpHqO8LvxjhflT3o2OThHDkOWLXM4opxwabIcgaH17ZXprAy6/XP8Y/vfx17/wYrCbHwUVrovS49GlFs6/+c0vNQu8tQPUlSOovfnuNxJat87b0sNjWMc8ppuGXWwWk6awgEYBfMGwfhtLT/d/VYLi3D28FhVAqy3DnbK8CJuuy8N+yYtQXZdPvyIuyfkgxcWlF7PLOy778TVwsPqOjT08dqSsylgtLaY9YrKK+Z6JleviJJmuAvaS+++vrb3JehQjQWkV1rOw45mcYHH0HZfOOGBiQHiouA/I9tZ11d662K8kLbG0RqfOXrZ5mrOZrEhUWSp7TWe3mM0taxcvUynnOPafOi523Tr7TxzLtHZOtW9YbNDlDInyHaj2r/WBbfHd7kYYateXiW/K7DrDtW25hZYTXPGY7XP1ijvSv3oqhlUqKkSEdWsU9nbM7JXi7f4akr8iAd+eVzeJQq2Ogs5dvZTW9hYLn4tL5XWI/ePJmm1dyrDu2MFhq29t10QYoCbCaE1a2r13syVkHxCh+2oL9a4EOnbFUFUXfzhnsv2xJEg/RkEgvjSqR6CGoLqeMLTrviPkMJimh8W/ZkjHuvgHF8b1/wA=</diagram></mxfile>
-103 KB
Binary file not shown.

docs/pandas_to_redshift/pandas-to-redshift-flow.xml

Lines changed: 0 additions & 2 deletions
This file was deleted.

0 commit comments

Comments
 (0)