Skip to content

GÜL #61689

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 13 commits into from
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -188,3 +188,12 @@ As contributors and maintainers to this project, you are expected to abide by pa
<hr>

[Go to Top](#table-of-contents)
Neva Aydın

Heba Walid Awad

Zeynep Genel

Gül Akkoca

Berat Nevcanoğlu
39 changes: 37 additions & 2 deletions pandas/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,21 @@ def write(
from_pandas_kwargs: dict[str, Any] = {"schema": kwargs.pop("schema", None)}
if index is not None:
from_pandas_kwargs["preserve_index"] = index

# Added section: record each StringDtype column's storage in the schema metadata.
table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
if any(isinstance(dtype,pd.StringDtype) for dtype in df.dtype):
string_dtype={
col:str(dtype.storage)
for col,dtype in df.dtypes.items()
if isinstance(dtype,pd.StringDtype)
}
metadata = table.schema.metadata or{}
for col,storage in string_dtypes.items():
key=f"pandas_string_dtype_{col}".encode()
val= storage.encode()
metadata[key]= val
table= table.replace_schema_metadata(metadata)


if df.attrs:
df_metadata = {"PANDAS_ATTRS": json.dumps(df.attrs)}
Expand Down Expand Up @@ -254,13 +267,35 @@ def read(
mode="rb",
)
try:
pa_table = self.api.parquet.read_table(
pa_table = self.api.parquet.read_table(
path_or_handle,
columns=columns,
filesystem=filesystem,
filters=filters,
**kwargs,
)

# Added section: read the pandas_string_dtype_* metadata entries.
string_dtypes = {}
metadata = pa_table.schema.metadata
if metadata:
for key, value in metadata.items():
if key.startswith(b"pandas_string_dtype_"):
col_name = key.replace(b"pandas_string_dtype_", b"").decode()
string_dtypes[col_name] = value.decode()

# Added section: the types_mapper function.
def types_mapper(pa_type):
for field in pa_table.schema:
if field.type == pa_type:
colname = field.name
if colname in string_dtypes:
return pd.StringDtype(storage=string_dtypes[colname])
return None # fallback to default mapper

if to_pandas_kwargs is None:
to_pandas_kwargs = {}
to_pandas_kwargs["types_mapper"] = types_mapper
with catch_warnings():
filterwarnings(
"ignore",
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -718,6 +718,26 @@ def test_basic_subset_columns(self, pa, df_full):
expected=df[["string", "int"]],
read_kwargs={"columns": ["string", "int"]},
)
# Newly added test: StringDtype storage must survive a parquet round trip.
@pytest.mark.parametrize("string_storage", ["pyarrow", "python"])
def test_parquet_stringdtype_roundtrip(self, tmp_path, pa, string_storage):
    """Write a StringDtype column to parquet and check it reads back unchanged.

    Parameters
    ----------
    tmp_path : pathlib.Path
        Temporary directory in which the parquet file is created.
    pa
        Fixture requested by the test; not used directly in the body.
    string_storage : str
        Storage backend supplied by ``parametrize`` ("pyarrow" or "python").
        It must be listed in the signature, otherwise pytest refuses to
        collect the test and the body cannot see the parametrized value.
    """
    import pandas as pd
    from pandas.testing import assert_frame_equal

    expected_dtype = pd.StringDtype(storage=string_storage)
    df = pd.DataFrame(
        {"a": pd.Series(["x", "y", "z"], dtype=expected_dtype)}
    )

    file_path = tmp_path / "stringdtype.parquet"
    df.to_parquet(file_path, engine="pyarrow")

    result = pd.read_parquet(file_path, engine="pyarrow")

    # Check the dtype explicitly first so a storage mismatch gives a clear message.
    assert result["a"].dtype == expected_dtype, (
        f"Dtype mismatch: got {result['a'].dtype}, expected {expected_dtype}"
    )

    assert_frame_equal(result, df)


def test_to_bytes_without_path_or_buf_provided(self, pa, df_full):
# GH 37105
Expand Down
Loading