
Commit 8e0f23a

Merge pull request #129 from awslabs/compression
Fix read_csv for compressed files
2 parents: 5f7aef7 + 60adaae

File tree

3 files changed: 23 additions, 1 deletion


awswrangler/pandas.py

Lines changed: 13 additions & 1 deletion
@@ -16,6 +16,7 @@
 from pyarrow import parquet as pq  # type: ignore
 import tenacity  # type: ignore
 from s3fs import S3FileSystem  # type: ignore
+from pandas.io.common import infer_compression  # type: ignore

 from awswrangler import data_types
 from awswrangler import utils
@@ -94,6 +95,13 @@ def _read_csv_iterator(self, bucket_name, key_path, max_result_size=200_000_000,
         :param **pd_additional_kwargs: Additional parameters forwarded to pandas.read_csv
         :return: Iterator of Pandas Dataframes
         """
+
+        if pd_additional_kwargs.get('compression', 'infer') == 'infer':
+            pd_additional_kwargs['compression'] = infer_compression(key_path, compression='infer')
+
+        if pd_additional_kwargs['compression'] is not None:
+            raise InvalidParameters("max_result_size currently does not support compressed files")
+
         metadata = S3.head_object_with_retry(client_s3=self._client_s3, bucket=bucket_name, key=key_path)
         total_size = metadata["ContentLength"]
         logger.debug(f"total_size: {total_size}")
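Why the guard: the iterator path reads the S3 object in byte-range chunks, and an arbitrary slice of a gzip stream cannot be decompressed on its own, so the patch fails fast instead of yielding garbled chunks. A standalone stdlib demonstration of that constraint (illustrative, not part of the patch):

    import gzip

    payload = gzip.compress(b"id,name\n1,foo\n2,bar\n")

    # A slice from the middle of the stream lacks the gzip magic bytes and
    # the deflate history, so it cannot be decompressed independently.
    try:
        gzip.decompress(payload[10:])
    except OSError as exc:  # gzip.BadGzipFile on Python 3.8+
        print(f"chunked decompression fails: {exc}")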
@@ -243,7 +251,11 @@ def _read_csv_once(session_primitives: "SessionPrimitives", bucket_name: str, ke
         session: Session = session_primitives.session
         client_s3 = session.boto3_session.client(service_name="s3", use_ssl=True, config=session.botocore_config)
         client_s3.download_fileobj(Bucket=bucket_name, Key=key_path, Fileobj=buff)
-        buff.seek(0),
+        buff.seek(0)
+
+        if pd_additional_kwargs.get('compression', 'infer') == 'infer':
+            pd_additional_kwargs['compression'] = infer_compression(key_path, compression='infer')
+
         dataframe = pd.read_csv(buff, **pd_additional_kwargs)
         buff.close()
         return dataframe
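Both code paths rely on the same pandas internal: infer_compression looks only at the trailing extension of the path string, so it works on an S3 key just as well as on a local file. A minimal sketch of that behavior:

    from pandas.io.common import infer_compression  # same internal the patch imports

    # Extension-based mapping: returns a codec name, or None for plain files.
    print(infer_compression("s3://bucket/small.csv.gz", compression="infer"))   # gzip
    print(infer_compression("s3://bucket/small.csv.bz2", compression="infer"))  # bz2
    print(infer_compression("s3://bucket/small.csv", compression="infer"))      # None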

data_samples/small.csv.gz

1.27 KB (binary file not shown)

testing/test_awswrangler/test_pandas.py

Lines changed: 10 additions & 0 deletions
@@ -155,6 +155,16 @@ def test_read_csv(session, bucket, sample, row_num):
     assert len(dataframe.index) == row_num


+@pytest.mark.parametrize("sample, row_num", [("data_samples/small.csv.gz", 100)])
+def test_read_csv_infer_compression(session, bucket, sample, row_num):
+    path = f"s3://{bucket}/{sample}"
+    session.s3.delete_objects(path=f"s3://{bucket}/")
+    boto3.client("s3").upload_file(sample, bucket, sample)
+    dataframe = session.pandas.read_csv(path=path)
+    session.s3.delete_objects(path=path)
+    assert len(dataframe.index) == row_num
+
+
 @pytest.mark.parametrize("sample, row_num", [("data_samples/micro.csv", 30), ("data_samples/small.csv", 100)])
 def test_read_csv_iterator(session, bucket, sample, row_num):
     boto3.client("s3").upload_file(sample, bucket, sample)
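For completeness, the end-to-end behavior the new test exercises; a hedged sketch assuming the pre-1.0 Session API the test suite uses (the bucket name is illustrative):

    import awswrangler

    session = awswrangler.Session()

    # With the fix, the gzip codec is inferred from the .gz suffix of the key,
    # so no explicit compression= argument is needed.
    df = session.pandas.read_csv(path="s3://my-bucket/data_samples/small.csv.gz")
    print(len(df.index))  # 100 rows in the bundled sample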
