Skip to content

Commit 60ee9ae

Browse files
authored
Merge pull request #116 from awslabs/range-index
Removing regular indexes from the compulsory Int64 cast
2 parents 5d9c525 + 4ab4e2d commit 60ee9ae

File tree

2 files changed, with 15 additions and 1 deletion.

awswrangler/pandas.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1519,9 +1519,10 @@ def _read_parquet_path(session_primitives: "SessionPrimitives",
15191519
fs.invalidate_cache()
15201520
table = pq.read_table(source=path, columns=columns, filters=filters, filesystem=fs, use_threads=use_threads)
15211521
# Check if we lose some integer during the conversion (Happens when has some null value)
1522-
integers = [field.name for field in table.schema if str(field.type).startswith("int")]
1522+
integers = [field.name for field in table.schema if str(field.type).startswith("int") and field.name != "__index_level_0__"]
15231523
logger.debug(f"Converting to Pandas: {path}")
15241524
df = table.to_pandas(use_threads=use_threads, integer_object_nulls=True)
1525+
logger.debug(f"Casting Int64 columns: {path}")
15251526
for c in integers:
15261527
if not str(df[c].dtype).startswith("int"):
15271528
df[c] = df[c].astype("Int64")

testing/test_awswrangler/test_pandas.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2186,3 +2186,16 @@ def test_to_parquet_categorical_partitions(bucket):
21862186
wr.pandas.to_parquet(x[x.Year == 1990], path=path, partition_cols=["Year"])
21872187
y = wr.pandas.read_parquet(path=path)
21882188
assert len(x[x.Year == 1990].index) == len(y.index)
2189+
2190+
2191+
def test_range_index(bucket, database):
2192+
path = f"s3://{bucket}/test_range_index"
2193+
wr.s3.delete_objects(path=path)
2194+
d = pd.date_range('1990-01-01', freq='D', periods=10000)
2195+
vals = pd.np.random.randn(len(d), 4)
2196+
x = pd.DataFrame(vals, index=d, columns=['A', 'B', 'C', 'D']).reset_index()
2197+
print(x)
2198+
wr.pandas.to_parquet(dataframe=x, path=path, database=database)
2199+
df = wr.pandas.read_parquet(path=path)
2200+
assert len(x.columns) == len(df.columns)
2201+
assert len(x.index) == len(df.index)

0 commit comments

Comments
 (0)