1 | | -from typing import Union |
| 1 | +from typing import Union, List, Dict, Tuple, Any |
2 | 2 | import logging |
| 3 | +import json |
3 | 4 | |
4 | 5 | import pg8000 # type: ignore |
5 | 6 | import pymysql # type: ignore |
| 7 | +import pandas as pd # type: ignore |
6 | 8 | |
7 | | -from awswrangler.exceptions import InvalidEngine |
| 9 | +from awswrangler import data_types |
| 10 | +from awswrangler.exceptions import InvalidEngine, InvalidDataframeType, AuroraLoadError |
8 | 11 | |
9 | 12 | logger = logging.getLogger(__name__) |
10 | 13 | |
11 | 14 | |
12 | 15 | class Aurora: |
13 | 16 | def __init__(self, session): |
14 | 17 | self._session = session |
| 18 | + self._client_s3 = session.boto3_session.client(service_name="s3", use_ssl=True, config=session.botocore_config) |
15 | 19 | |
16 | 20 | @staticmethod |
17 | 21 | def _validate_connection(database: str, |
@@ -101,3 +105,163 @@ def generate_connection(database: str, |
101 | 105 | else: |
102 | 106 | raise InvalidEngine(f"{engine} is not a valid engine. Please use 'mysql' or 'postgres'!") |
103 | 107 | return conn |
| 108 | + |
| 109 | + def write_load_manifest(self, manifest_path: str, |
| 110 | + objects_paths: List[str]) -> Dict[str, List[Dict[str, Union[str, bool]]]]: |
| 111 | + manifest: Dict[str, List[Dict[str, Union[str, bool]]]] = {"entries": []} |
| 112 | + path: str |
| 113 | + for path in objects_paths: |
| 114 | + entry: Dict[str, Union[str, bool]] = {"url": path, "mandatory": True} |
| 115 | + manifest["entries"].append(entry) |
| 116 | + payload: str = json.dumps(manifest) |
| 117 | + bucket: str |
| 118 | + bucket, key = manifest_path.replace("s3://", "").split("/", 1) |
| 119 | + logger.debug(f"payload: {payload}") |
| 120 | + self._client_s3.put_object(Body=payload, Bucket=bucket, Key=key) |
| 121 | + return manifest |
| 122 | + |
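A minimal sketch of what write_load_manifest produces, assuming an awswrangler Session; the bucket and object paths below are illustrative:

    aurora = Aurora(session)
    manifest = aurora.write_load_manifest(
        manifest_path="s3://my-bucket/tmp/manifest.json",
        objects_paths=["s3://my-bucket/tmp/part-0.csv",
                       "s3://my-bucket/tmp/part-1.csv"])
    # Uploaded to s3://my-bucket/tmp/manifest.json and returned:
    # {"entries": [{"url": "s3://my-bucket/tmp/part-0.csv", "mandatory": True},
    #              {"url": "s3://my-bucket/tmp/part-1.csv", "mandatory": True}]}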
| 123 | + @staticmethod |
| 124 | + def load_table(dataframe: pd.DataFrame, |
| 125 | + dataframe_type: str, |
| 126 | + load_paths: List[str], |
| 127 | + schema_name: str, |
| 128 | + table_name: str, |
| 129 | + connection: Any, |
| 130 | + num_files: int, |
| 131 | + mode: str = "append", |
| 132 | + preserve_index: bool = False, |
| 133 | + engine: str = "mysql", |
| 134 | + region: str = "us-east-1"): |
| 135 | + """ |
| 136 | + Load text/CSV files into an Aurora table using a manifest file. |
| 137 | + Creates the table if necessary. |
| 138 | + |
| 139 | + :param dataframe: Pandas or Spark Dataframe |
| 140 | + :param dataframe_type: "pandas" or "spark" |
| 141 | + :param load_paths: S3 paths to be loaded (e.g. s3://...) |
| 142 | + :param schema_name: Aurora schema |
| 143 | + :param table_name: Aurora table name |
| 144 | + :param connection: A PEP 249 compatible connection (Can be generated with Aurora.generate_connection()) |
| 145 | + :param num_files: Number of files to be loaded |
| 146 | + :param mode: append or overwrite |
| 147 | + :param preserve_index: Should we preserve the Dataframe index? (ONLY for Pandas Dataframe) |
| 148 | + :param engine: "mysql" or "postgres" |
| 149 | + :param region: AWS S3 bucket region (Required only for postgres engine) |
| 150 | + :return: None |
| 151 | + """ |
| 152 | + with connection.cursor() as cursor: |
| 153 | + if mode == "overwrite": |
| 154 | + Aurora._create_table(cursor=cursor, |
| 155 | + dataframe=dataframe, |
| 156 | + dataframe_type=dataframe_type, |
| 157 | + schema_name=schema_name, |
| 158 | + table_name=table_name, |
| 159 | + preserve_index=preserve_index, |
| 160 | + engine=engine) |
| 161 | + |
| 162 | + for path in load_paths: |
| 163 | + sql = Aurora._get_load_sql(path=path, |
| 164 | + schema_name=schema_name, |
| 165 | + table_name=table_name, |
| 166 | + engine=engine, |
| 167 | + region=region) |
| 168 | + logger.debug(sql) |
| 169 | + cursor.execute(sql) |
| 170 | + |
| 171 | + if "mysql" in engine.lower(): |
| 172 | + sql = ("-- AWS DATA WRANGLER\n" |
| 173 | + f"SELECT COUNT(*) as num_files_loaded FROM mysql.aurora_s3_load_history " |
| 174 | + f"WHERE load_prefix = '{path}'") |
| 175 | + logger.debug(sql) |
| 176 | + cursor.execute(sql) |
| 177 | + num_files_loaded = cursor.fetchall()[0][0] |
| 178 | + if num_files_loaded != (num_files + 1):  # the load history counts the manifest file too |
| 179 | + connection.rollback() |
| 180 | + raise AuroraLoadError( |
| 181 | + f"Aurora load rolled back. {num_files_loaded} files counted. {num_files + 1} expected.") |
| 182 | + |
| 183 | + connection.commit() |
| 184 | + logger.debug("Load committed.") |
| 185 | + |
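A minimal end-to-end sketch of load_table for the MySQL engine, assuming the two CSV parts and the manifest above were already staged to S3. Host, credentials, and names are illustrative; any PEP 249 connection works, with Aurora.generate_connection() being the usual source:

    import pandas as pd
    import pymysql

    df = pd.DataFrame({"id": [1, 2], "name": ["foo", "boo"]})
    conn = pymysql.connect(host="my-cluster.cluster-xyz.us-east-1.rds.amazonaws.com",
                           user="test", password="secret", database="test")
    Aurora.load_table(dataframe=df, dataframe_type="pandas",
                      load_paths=["s3://my-bucket/tmp/manifest.json"],
                      schema_name="test", table_name="my_table",
                      connection=conn, num_files=2,
                      mode="overwrite", engine="mysql")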
| 186 | + @staticmethod |
| 187 | + def _parse_path(path: str) -> Tuple[str, str]: |
| 188 | + path2 = path.replace("s3://", "") |
| 189 | + parts = path2.partition("/") |
| 190 | + return parts[0], parts[2] |
| 191 | + |
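Note that _parse_path splits on the first slash only, so nested keys survive intact:

    Aurora._parse_path("s3://bucket/prefix/file.csv")  # -> ("bucket", "prefix/file.csv")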
| 192 | + @staticmethod |
| 193 | + def _get_load_sql(path: str, schema_name: str, table_name: str, engine: str, region: str = "us-east-1") -> str: |
| 194 | + if "postgres" in engine.lower(): |
| 195 | + bucket, key = Aurora._parse_path(path=path) |
| 196 | + sql: str = ("-- AWS DATA WRANGLER\n" |
| 197 | + "SELECT aws_s3.table_import_from_s3(\n" |
| 198 | + f"'{schema_name}.{table_name}',\n" |
| 199 | + "'',\n" |
| 200 | + "'(FORMAT CSV, DELIMITER '','', QUOTE ''\"'', ESCAPE ''\\'')',\n" |
| 201 | + f"'({bucket},{key},{region})')") |
| 202 | + elif "mysql" in engine.lower(): |
| 203 | + sql = ("-- AWS DATA WRANGLER\n" |
| 204 | + "SELECT aws_s3.table_import_from_s3(\n" |
| 205 | + f"LOAD DATA FROM S3 MANIFEST '{path}'\n" |
| 206 | + "REPLACE\n" |
| 207 | + f"INTO TABLE {schema_name}.{table_name}\n" |
| 208 | + "FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' ESCAPED BY '\\\\'\n" |
| 209 | + "LINES TERMINATED BY '\\n'") |
| 210 | + else: |
| 211 | + raise InvalidEngine(f"{engine} is not a valid engine. Please use 'mysql' or 'postgres'!") |
| 212 | + return sql |
| 213 | + |
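For illustration, the MySQL branch renders a statement like the following (paths and names as in the earlier sketches):

    -- AWS DATA WRANGLER
    LOAD DATA FROM S3 MANIFEST 's3://my-bucket/tmp/manifest.json'
    REPLACE
    INTO TABLE test.my_table
    FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' ESCAPED BY '\\'
    LINES TERMINATED BY '\n'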
| 214 | + @staticmethod |
| 215 | + def _create_table(cursor, |
| 216 | + dataframe, |
| 217 | + dataframe_type: str, |
| 218 | + schema_name: str, |
| 219 | + table_name: str, |
| 220 | + preserve_index: bool = False, |
| 221 | + engine: str = "mysql"): |
| 222 | + """ |
| 223 | + Creates Aurora table. |
| 224 | + |
| 225 | + :param cursor: A PEP 249 compatible cursor |
| 226 | + :param dataframe: Pandas or Spark Dataframe |
| 227 | + :param dataframe_type: "pandas" or "spark" |
| 228 | + :param schema_name: Aurora schema |
| 229 | + :param table_name: Aurora table name |
| 230 | + :param preserve_index: Should we preserve the Dataframe index? (ONLY for Pandas Dataframe) |
| 231 | + :param engine: "mysql" or "postgres" |
| 232 | + :return: None |
| 233 | + """ |
| 234 | + sql: str = ("-- AWS DATA WRANGLER\n" |
| 235 | + f"DROP TABLE IF EXISTS {schema_name}.{table_name}") |
| 236 | + logger.debug(f"Drop table query:\n{sql}") |
| 237 | + cursor.execute(sql) |
| 238 | + schema = Aurora._get_schema(dataframe=dataframe, |
| 239 | + dataframe_type=dataframe_type, |
| 240 | + preserve_index=preserve_index, |
| 241 | + engine=engine) |
| 242 | + cols_str: str = "".join([f"{col[0]} {col[1]},\n" for col in schema])[:-2] |
| 243 | + sql = (f"-- AWS DATA WRANGLER\n" f"CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} (\n" f"{cols_str})") |
| 244 | + logger.debug(f"Create table query:\n{sql}") |
| 245 | + cursor.execute(sql) |
| 246 | + |
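Given the example frame above, and assuming data_types maps int64 to BIGINT and strings to TEXT (see _get_schema below), the assembled DDL would look like:

    -- AWS DATA WRANGLER
    CREATE TABLE IF NOT EXISTS test.my_table (
    id BIGINT,
    name TEXT)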
| 247 | + @staticmethod |
| 248 | + def _get_schema(dataframe, |
| 249 | + dataframe_type: str, |
| 250 | + preserve_index: bool, |
| 251 | + engine: str = "mysql") -> List[Tuple[str, str]]: |
| 252 | + schema_built: List[Tuple[str, str]] = [] |
| 253 | + if "postgres" in engine.lower(): |
| 254 | + convert_func = data_types.pyarrow2postgres |
| 255 | + elif "mysql" in engine.lower(): |
| 256 | + convert_func = data_types.pyarrow2mysql |
| 257 | + else: |
| 258 | + raise InvalidEngine(f"{engine} is not a valid engine. Please use 'mysql' or 'postgres'!") |
| 259 | + if dataframe_type.lower() == "pandas": |
| 260 | + pyarrow_schema: List[Tuple[str, str]] = data_types.extract_pyarrow_schema_from_pandas( |
| 261 | + dataframe=dataframe, preserve_index=preserve_index, indexes_position="right") |
| 262 | + for name, dtype in pyarrow_schema: |
| 263 | + aurora_type: str = convert_func(dtype) |
| 264 | + schema_built.append((name, aurora_type)) |
| 265 | + else: |
| 266 | + raise InvalidDataframeType(f"{dataframe_type} is not a valid DataFrame type. Please use 'pandas'!") |
| 267 | + return schema_built |
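A minimal sketch of the pandas branch; the exact type names come from data_types.pyarrow2mysql, so BIGINT and TEXT here are assumptions:

    df = pd.DataFrame({"id": [1, 2], "name": ["foo", "boo"]})
    Aurora._get_schema(dataframe=df, dataframe_type="pandas",
                       preserve_index=False, engine="mysql")
    # -> [("id", "BIGINT"), ("name", "TEXT")]  (assumed mapping)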