diff --git a/README.md b/README.md index ebab2e6016850..45a400e423b5e 100644 --- a/README.md +++ b/README.md @@ -188,3 +188,12 @@ As contributors and maintainers to this project, you are expected to abide by pa
[Go to Top](#table-of-contents) +Neva Aydın + +Heba Walid Awad + +Zeynep Genel + +Gül Akkoca + +Berat Nevcanoğlu diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 6a5a83088e986..1803c06d564e3 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -184,8 +184,21 @@ def write( from_pandas_kwargs: dict[str, Any] = {"schema": kwargs.pop("schema", None)} if index is not None: from_pandas_kwargs["preserve_index"] = index - +#ekleme yaptığım yer. table = self.api.Table.from_pandas(df, **from_pandas_kwargs) + if any(isinstance(dtype,pd.StringDtype) for dtype in df.dtype): + string_dtype={ + col:str(dtype.storage) + for col,dtype in df.dtypes.items() + if isinstance(dtype,pd.StringDtype) + } + metadata = table.schema.metadata or{} + for col,storage in string_dtypes.items(): + key=f"pandas_string_dtype_{col}".encode() + val= storage.encode() + metadata[key]= val + table= table.replace_schema_metadata(metadata) + if df.attrs: df_metadata = {"PANDAS_ATTRS": json.dumps(df.attrs)} @@ -254,13 +267,35 @@ def read( mode="rb", ) try: - pa_table = self.api.parquet.read_table( + pa_table = self.api.parquet.read_table( path_or_handle, columns=columns, filesystem=filesystem, filters=filters, **kwargs, ) + + #eklediğim bölüm pandas_string_dtype_* metadata'larını oku + string_dtypes = {} + metadata = pa_table.schema.metadata + if metadata: + for key, value in metadata.items(): + if key.startswith(b"pandas_string_dtype_"): + col_name = key.replace(b"pandas_string_dtype_", b"").decode() + string_dtypes[col_name] = value.decode() + + # Eklediğim bölüm: types_mapper fonksiyonu + def types_mapper(pa_type): + for field in pa_table.schema: + if field.type == pa_type: + colname = field.name + if colname in string_dtypes: + return pd.StringDtype(storage=string_dtypes[colname]) + return None # fallback to default mapper + + if to_pandas_kwargs is None: + to_pandas_kwargs = {} + to_pandas_kwargs["types_mapper"] = types_mapper with catch_warnings(): filterwarnings( "ignore", 
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 78f39b649cb9a..ce77bfa6f36d9 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -718,6 +718,25 @@ def test_basic_subset_columns(self, pa, df_full):
             expected=df[["string", "int"]],
             read_kwargs={"columns": ["string", "int"]},
         )
+
+    @pytest.mark.parametrize("string_storage", ["pyarrow", "python"])
+    def test_parquet_stringdtype_roundtrip(self, tmp_path, pa, string_storage):
+        # The storage backend of a StringDtype column must survive a
+        # write/read round trip through the pyarrow engine.
+        import pandas as pd
+        from pandas.testing import assert_frame_equal
+
+        expected_dtype = pd.StringDtype(storage=string_storage)
+        df = pd.DataFrame({"a": pd.Series(["x", "y", "z"], dtype=expected_dtype)})
+
+        file_path = tmp_path / "stringdtype.parquet"
+        df.to_parquet(file_path, engine="pyarrow")
+        result = pd.read_parquet(file_path, engine="pyarrow")
+
+        assert result["a"].dtype == expected_dtype, (
+            f"Dtype mismatch: got {result['a'].dtype}, expected {expected_dtype}"
+        )
+        assert_frame_equal(result, df)
 
     def test_to_bytes_without_path_or_buf_provided(self, pa, df_full):
         # GH 37105