16 | 16 | from pyarrow import parquet as pq # type: ignore |
17 | 17 | import tenacity # type: ignore |
18 | 18 | from s3fs import S3FileSystem # type: ignore |
| 19 | +from pandas.io.common import infer_compression # type: ignore |
19 | 20 |
20 | 21 | from awswrangler import data_types |
21 | 22 | from awswrangler import utils |
@@ -94,6 +95,13 @@ def _read_csv_iterator(self, bucket_name, key_path, max_result_size=200_000_000, |
94 | 95 | :param **pd_additional_kwargs: Additional parameters forwarded to pandas.read_csv |
95 | 96 | :return: Iterator of Pandas Dataframes |
96 | 97 | """ |
| 98 | + |
| 99 | + if pd_additional_kwargs.get('compression', 'infer') == 'infer': |
| 100 | + pd_additional_kwargs['compression'] = infer_compression(key_path, compression='infer') |
| 101 | + |
| 102 | + if pd_additional_kwargs['compression'] is not None: |
| 103 | + raise InvalidParameters("max_result_size currently does not support compressed files") |
| 104 | + |
97 | 105 | metadata = S3.head_object_with_retry(client_s3=self._client_s3, bucket=bucket_name, key=key_path) |
98 | 106 | total_size = metadata["ContentLength"] |
99 | 107 | logger.debug(f"total_size: {total_size}") |
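The guard added above resolves pandas' default `compression='infer'` against the key before the byte-range iterator starts, since chunked reads of a compressed object cannot be decompressed piecewise, and rejects anything that resolves to a real codec. `infer_compression` decides purely from the file extension; a minimal sketch of its behavior (the paths are illustrative):

```python
from pandas.io.common import infer_compression  # same helper the patch imports

# Inference is extension-based only; no bytes are read.
infer_compression("bucket/key/data.csv.gz", compression="infer")   # -> 'gzip'
infer_compression("bucket/key/data.csv.bz2", compression="infer")  # -> 'bz2'
infer_compression("bucket/key/data.csv", compression="infer")      # -> None
```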
@@ -243,7 +251,11 @@ def _read_csv_once(session_primitives: "SessionPrimitives", bucket_name: str, ke |
243 | 251 | session: Session = session_primitives.session |
244 | 252 | client_s3 = session.boto3_session.client(service_name="s3", use_ssl=True, config=session.botocore_config) |
245 | 253 | client_s3.download_fileobj(Bucket=bucket_name, Key=key_path, Fileobj=buff) |
246 | | - buff.seek(0), |
| 254 | + buff.seek(0) |
| 255 | + |
| 256 | + if pd_additional_kwargs.get('compression', 'infer') == 'infer': |
| 257 | + pd_additional_kwargs['compression'] = infer_compression(key_path, compression='infer') |
| 258 | + |
247 | 259 | dataframe = pd.read_csv(buff, **pd_additional_kwargs) |
248 | 260 | buff.close() |
249 | 261 | return dataframe |
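The single-shot path needs the same default resolution because pandas reads from an in-memory buffer here and has no filename from which to infer a codec: left as `'infer'`, the data would be parsed as uncompressed. A minimal sketch of the fixed flow, assuming a plain boto3 client in place of the session wiring (the function name is illustrative):

```python
import io

import boto3
import pandas as pd
from pandas.io.common import infer_compression  # type: ignore


def read_csv_once_sketch(bucket_name: str, key_path: str, **pd_additional_kwargs) -> pd.DataFrame:
    """Sketch of the single-shot read; not awswrangler's actual session plumbing."""
    buff = io.BytesIO()
    client_s3 = boto3.client("s3")
    client_s3.download_fileobj(Bucket=bucket_name, Key=key_path, Fileobj=buff)
    buff.seek(0)  # rewind before handing the buffer to pandas
    # A BytesIO buffer carries no extension, so resolve 'infer' from the key.
    if pd_additional_kwargs.get("compression", "infer") == "infer":
        pd_additional_kwargs["compression"] = infer_compression(key_path, compression="infer")
    dataframe = pd.read_csv(buff, **pd_additional_kwargs)
    buff.close()
    return dataframe
```

The removed `buff.seek(0),` had a stray trailing comma, which built and discarded a one-element tuple; harmless at runtime, but the patch drops it for correctness of style.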