
Commit a70e5aa

Improve postgres/redshift read. #427 #431
1 parent b1c1b79 commit a70e5aa

File tree

6 files changed: +60 -12 lines

README.md

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 
 > An [AWS Professional Service](https://aws.amazon.com/professional-services/) open source initiative | [email protected]
 
-[![Release](https://img.shields.io/badge/release-1.9.6-brightgreen.svg)](https://pypi.org/project/awswrangler/)
+[![Release](https://img.shields.io/badge/release-1.10.0-brightgreen.svg)](https://pypi.org/project/awswrangler/)
 [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-brightgreen.svg)](https://anaconda.org/conda-forge/awswrangler)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)

awswrangler/__metadata__.py

Lines changed: 1 addition & 1 deletion
@@ -7,5 +7,5 @@
 
 __title__: str = "awswrangler"
 __description__: str = "Pandas on AWS."
-__version__: str = "1.9.6"
+__version__: str = "1.10.0"
 __license__: str = "Apache License 2.0"

awswrangler/db.py

Lines changed: 25 additions & 8 deletions
@@ -187,17 +187,22 @@ def _records2df(
     records: List[Tuple[Any]],
     cols_names: List[str],
     index: Optional[Union[str, List[str]]],
-    dtype: Optional[Dict[str, pa.DataType]] = None,
+    safe: bool,
+    dtype: Optional[Dict[str, pa.DataType]],
 ) -> pd.DataFrame:
     arrays: List[pa.Array] = []
     for col_values, col_name in zip(tuple(zip(*records)), cols_names):  # Transposing
         if (dtype is None) or (col_name not in dtype):
             try:
-                array: pa.Array = pa.array(obj=col_values, safe=True)  # Creating Arrow array
+                array: pa.Array = pa.array(obj=col_values, safe=safe)  # Creating Arrow array
             except pa.ArrowInvalid as ex:
                 array = _data_types.process_not_inferred_array(ex, values=col_values)  # Creating Arrow array
         else:
-            array = pa.array(obj=col_values, type=dtype[col_name], safe=True)  # Creating Arrow array with dtype
+            try:
+                array = pa.array(obj=col_values, type=dtype[col_name], safe=safe)  # Creating Arrow array with dtype
+            except pa.ArrowInvalid:
+                array = pa.array(obj=col_values, safe=safe)  # Creating Arrow array
+                array = array.cast(target_type=dtype[col_name], safe=safe)  # Casting
         arrays.append(array)
     table = pa.Table.from_arrays(arrays=arrays, names=cols_names)  # Creating arrow Table
     df: pd.DataFrame = table.to_pandas(  # Creating Pandas DataFrame
@@ -207,6 +212,7 @@ def _records2df(
         integer_object_nulls=False,
         date_as_object=True,
         types_mapper=_data_types.pyarrow2pandas_extension,
+        safe=safe,
     )
     if index is not None:
         df.set_index(index, inplace=True)
@@ -218,13 +224,14 @@ def _iterate_cursor(
     chunksize: int,
     cols_names: List[str],
     index: Optional[Union[str, List[str]]],
-    dtype: Optional[Dict[str, pa.DataType]] = None,
+    safe: bool,
+    dtype: Optional[Dict[str, pa.DataType]],
 ) -> Iterator[pd.DataFrame]:
     while True:
         records = cursor.fetchmany(chunksize)
         if not records:
             break
-        yield _records2df(records=records, cols_names=cols_names, index=index, dtype=dtype)
+        yield _records2df(records=records, cols_names=cols_names, index=index, safe=safe, dtype=dtype)
 
 
 def _convert_params(sql: str, params: Optional[Union[List[Any], Tuple[Any, ...], Dict[Any, Any]]]) -> List[Any]:
@@ -366,6 +373,7 @@ def read_sql_query(
     params: Optional[Union[List[Any], Tuple[Any, ...], Dict[Any, Any]]] = None,
     chunksize: Optional[int] = None,
     dtype: Optional[Dict[str, pa.DataType]] = None,
+    safe: bool = True,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     """Return a DataFrame corresponding to the result set of the query string.
 
@@ -395,6 +403,8 @@ def read_sql_query(
     dtype : Dict[str, pyarrow.DataType], optional
         Specifying the datatype for columns.
         The keys should be the column names and the values should be the PyArrow types.
+    safe : bool
+        Check for overflows or other unsafe data type conversions.
 
     Returns
     -------
@@ -425,9 +435,11 @@ def read_sql_query(
     args = _convert_params(sql, params)
     cursor = _con.execute(*args)
     if chunksize is None:
-        return _records2df(records=cursor.fetchall(), cols_names=cursor.keys(), index=index_col, dtype=dtype)
+        return _records2df(
+            records=cursor.fetchall(), cols_names=cursor.keys(), index=index_col, dtype=dtype, safe=safe
+        )
     return _iterate_cursor(
-        cursor=cursor, chunksize=chunksize, cols_names=cursor.keys(), index=index_col, dtype=dtype
+        cursor=cursor, chunksize=chunksize, cols_names=cursor.keys(), index=index_col, dtype=dtype, safe=safe
     )
 
 
@@ -439,6 +451,7 @@ def read_sql_table(
     params: Optional[Union[List[Any], Tuple[Any, ...], Dict[Any, Any]]] = None,
     chunksize: Optional[int] = None,
     dtype: Optional[Dict[str, pa.DataType]] = None,
+    safe: bool = True,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     """Return a DataFrame corresponding to the result set of the query string.
 
@@ -471,6 +484,8 @@ def read_sql_table(
     dtype : Dict[str, pyarrow.DataType], optional
         Specifying the datatype for columns.
         The keys should be the column names and the values should be the PyArrow types.
+    safe : bool
+        Check for overflows or other unsafe data type conversions.
 
     Returns
     -------
@@ -502,7 +517,9 @@ def read_sql_table(
         sql: str = f"SELECT * FROM {table}"
     else:
         sql = f"SELECT * FROM {schema}.{table}"
-    return read_sql_query(sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype)
+    return read_sql_query(
+        sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype, safe=safe
+    )
 
 
 def get_redshift_temp_engine(
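The key change in _records2df is the new safe flag (forwarded from read_sql_query/read_sql_table and into Table.to_pandas) plus a fallback when PyArrow cannot build an array directly with the requested dtype: the array is first built with an inferred type and then cast. Below is a minimal, self-contained sketch of that fallback, assuming only pyarrow is installed; the sample values and target type are illustrative (not from the commit), and the sketch additionally catches pa.ArrowTypeError for robustness across pyarrow versions, whereas the commit itself handles pa.ArrowInvalid.

from decimal import Decimal

import pyarrow as pa

values = (Decimal("1.99"), None, Decimal("1.90"))  # e.g. a Redshift DECIMAL column
target = pa.float64()                               # the type requested via the dtype argument
safe = True                                         # pass safe=False to skip overflow checks

try:
    # Original code path: build the Arrow array directly with the requested type.
    array = pa.array(obj=values, type=target, safe=safe)
except (pa.ArrowInvalid, pa.ArrowTypeError):
    # Fallback added by this commit: infer the type first, then cast.
    array = pa.array(obj=values, safe=safe)             # inferred as decimal128
    array = array.cast(target_type=target, safe=safe)   # cast to the requested float64

print(array.type)  # double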

docs/source/install.rst

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ Go to your Glue PySpark job and create a new *Job parameters* key/value:
 
 To install a specific version, set the value for above Job parameter as follows:
 
-* Value: ``awswrangler==1.9.6``
+* Value: ``awswrangler==1.10.0``
 
 `Official Glue PySpark Reference <https://docs.aws.amazon.com/glue/latest/dg/reduced-start-times-spark-etl-jobs.html#reduced-start-times-new-features>`_
tests/test_db.py

Lines changed: 31 additions & 0 deletions
@@ -704,3 +704,34 @@ def test_redshift_copy_extras(path, redshift_table, databases_parameters, use_th
     assert df.int16.sum() * num == df2.int16.sum()
     assert df.int32.sum() * num == df2.int32.sum()
     assert df.int64.sum() * num == df2.int64.sum()
+
+
+def test_redshift_decimal_cast(redshift_table):
+    df = pd.DataFrame(
+        {
+            "col0": [Decimal((0, (1, 9, 9), -2)), None, Decimal((0, (1, 9, 0), -2))],
+            "col1": [Decimal((0, (1, 9, 9), -2)), None, Decimal((0, (1, 9, 0), -2))],
+            "col2": [Decimal((0, (1, 9, 9), -2)), None, Decimal((0, (1, 9, 0), -2))],
+        }
+    )
+    engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")
+    wr.db.to_sql(df, engine, name=redshift_table)
+    df2 = wr.db.read_sql_table(
+        schema="public", table=redshift_table, con=engine, dtype={"col0": "float32", "col1": "float64", "col2": "Int64"}
+    )
+    assert df2.dtypes.to_list() == ["float32", "float64", "Int64"]
+    assert 3.88 <= df2.col0.sum() <= 3.89
+    assert 3.88 <= df2.col1.sum() <= 3.89
+    assert df2.col2.sum() == 2
+
+
+def test_postgresql_out_of_bound():
+    engine = wr.catalog.get_engine(connection="aws-data-wrangler-postgresql")
+    sql = """
+    SELECT TO_TIMESTAMP(
+        '9999-12-31 9:30:20',
+        'YYYY-MM-DD HH:MI:SS'
+    )::timestamp without time zone;
+    """
+    df = wr.db.read_sql_query(sql=sql, con=engine, safe=False)
+    assert df.shape == (1, 1)
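As the tests above suggest, callers hitting values that PyArrow refuses to convert (for example PostgreSQL timestamps beyond the pandas datetime64[ns] range) can now pass safe=False, and dtype can still force per-column types. A hedged usage sketch, assuming the Glue Catalog connection name used by the test suite exists in your account; the table and column names in the second call are hypothetical:

import awswrangler as wr
import pyarrow as pa

# Engine resolved from a Glue Catalog connection (name taken from the tests above).
engine = wr.catalog.get_engine(connection="aws-data-wrangler-postgresql")

# safe=False disables PyArrow's overflow checks, so out-of-bound timestamps
# no longer raise during the Arrow -> pandas conversion.
df = wr.db.read_sql_query(
    sql="SELECT TO_TIMESTAMP('9999-12-31 9:30:20', 'YYYY-MM-DD HH:MI:SS')::timestamp without time zone",
    con=engine,
    safe=False,
)

# dtype maps column names to PyArrow types, per the updated docstring.
df2 = wr.db.read_sql_table(
    table="my_table",                # hypothetical table name
    schema="public",
    con=engine,
    dtype={"price": pa.float64()},   # hypothetical column name
    safe=True,
)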

tests/test_metadata.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 
 def test_metadata():
-    assert wr.__version__ == "1.9.6"
+    assert wr.__version__ == "1.10.0"
     assert wr.__title__ == "awswrangler"
     assert wr.__description__ == "Pandas on AWS."
     assert wr.__license__ == "Apache License 2.0"
