From 6e3932b061d2da2de0cd44e7fc1547d481fdb83d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 7 Jan 2025 17:47:14 +0100 Subject: [PATCH 1/5] GH-45175: [Python] Honor the strings_to_categorical keyword in to_pandas for string view type (#45176) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Currently this keyword works for string or large string: ```python >>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.string())}) >>> table.to_pandas(strings_to_categorical=True).dtypes col category dtype: object >>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.large_string())}) >>> table.to_pandas(strings_to_categorical=True).dtypes col category dtype: object ``` but not for string view: ```python >>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.string_view())}) >>> table.to_pandas(strings_to_categorical=True).dtypes col object dtype: object ``` For consistency we should make that keyword check for string view columns as well, I think From https://github.com/apache/arrow/pull/44195/files#r1901831460 ### Are these changes tested? Yes ### Are there any user-facing changes? Yes, when using the `strings_to_categorical=True` keyword and having a string_view type, this column will now be converted to a pandas Categorical * GitHub Issue: #45175 Authored-by: Joris Van den Bossche Signed-off-by: Raúl Cumplido --- .../src/arrow/python/arrow_to_pandas.cc | 6 ++-- python/pyarrow/tests/test_pandas.py | 32 ++++++++++++++++--- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 10c4d0e1600..a0f1d5bbbed 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -2523,7 +2523,8 @@ Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arr } if (options.strings_to_categorical) { for (int i = 0; i < static_cast(arrays->size()); i++) { - if (is_base_binary_like((*arrays)[i]->type()->id())) { + if (is_base_binary_like((*arrays)[i]->type()->id()) || + is_binary_view_like((*arrays)[i]->type()->id())) { columns_to_encode.push_back(i); } } @@ -2557,7 +2558,8 @@ Status ConvertChunkedArrayToPandas(const PandasOptions& options, py_ref = nullptr; } - if (options.strings_to_categorical && is_base_binary_like(arr->type()->id())) { + if (options.strings_to_categorical && (is_base_binary_like(arr->type()->id()) || + is_binary_view_like(arr->type()->id()))) { if (options.zero_copy_only) { return Status::Invalid("Need to dictionary encode a column, but ", "only zero-copy conversions allowed"); diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 1186f87b032..d5c936df072 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -1836,10 +1836,13 @@ def test_to_pandas_categories_already_dictionary(self): result = table.to_pandas(categories=['col']) assert table.to_pandas().equals(result) - def test_table_str_to_categorical_without_na(self): + @pytest.mark.parametrize( + "string_type", [pa.string(), pa.large_string(), pa.string_view()] + ) + def test_table_str_to_categorical_without_na(self, string_type): values = ['a', 'a', 'b', 'b', 'c'] df = pd.DataFrame({'strings': values}) - field = pa.field('strings', pa.string()) + field = pa.field('strings', string_type) schema = pa.schema([field]) table = pa.Table.from_pandas(df, schema=schema) @@ -1851,10 
+1854,22 @@ def test_table_str_to_categorical_without_na(self): table.to_pandas(strings_to_categorical=True, zero_copy_only=True) - def test_table_str_to_categorical_with_na(self): + # chunked array + result = table["strings"].to_pandas(strings_to_categorical=True) + expected = pd.Series(pd.Categorical(values), name="strings") + tm.assert_series_equal(result, expected) + + with pytest.raises(pa.ArrowInvalid): + table["strings"].to_pandas(strings_to_categorical=True, + zero_copy_only=True) + + @pytest.mark.parametrize( + "string_type", [pa.string(), pa.large_string(), pa.string_view()] + ) + def test_table_str_to_categorical_with_na(self, string_type): values = [None, 'a', 'b', np.nan] df = pd.DataFrame({'strings': values}) - field = pa.field('strings', pa.string()) + field = pa.field('strings', string_type) schema = pa.schema([field]) table = pa.Table.from_pandas(df, schema=schema) @@ -1866,6 +1881,15 @@ def test_table_str_to_categorical_with_na(self): table.to_pandas(strings_to_categorical=True, zero_copy_only=True) + # chunked array + result = table["strings"].to_pandas(strings_to_categorical=True) + expected = pd.Series(pd.Categorical(values), name="strings") + tm.assert_series_equal(result, expected) + + with pytest.raises(pa.ArrowInvalid): + table["strings"].to_pandas(strings_to_categorical=True, + zero_copy_only=True) + # Regression test for ARROW-2101 def test_array_of_bytes_to_strings(self): converted = pa.array(np.array([b'x'], dtype=object), pa.string()) From 5ce627cd89868ec84065f212c884727c71fc764e Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Tue, 7 Jan 2025 11:18:47 -0800 Subject: [PATCH 2/5] GH-45140: [Dev][Release] Release guide improvements (#45141) ### What changes are included in this PR? Updates to the release guide. Mostly changes to make the guide up to date with how release are being done currently. ### Are these changes tested? Previewed locally. ### Are there any user-facing changes? More accurate docs. Fixes #45140 * GitHub Issue: #45140 Lead-authored-by: Bryce Mecum Co-authored-by: Sutou Kouhei Signed-off-by: Bryce Mecum --- docs/source/developers/release.rst | 127 ++++++++++++++++++----------- 1 file changed, 80 insertions(+), 47 deletions(-) diff --git a/docs/source/developers/release.rst b/docs/source/developers/release.rst index 52f4a751dcc..605a1adbe10 100644 --- a/docs/source/developers/release.rst +++ b/docs/source/developers/release.rst @@ -24,7 +24,10 @@ Release Management Guide This page provides detailed information on the steps followed to perform a release. It can be used both as a guide to learn the Apache Arrow release process and as a comprehensive checklist for the Release Manager when -performing a release. +performing a release. The person acting as Release Manager must at least have +committer status in order to perform the tasks below. If the Release Manager is +a committer but not a member of the PMC, some tasks will need to be delegated +to a PMC member and these are marked below accordingly. Principles ========== @@ -36,8 +39,15 @@ Preparing for the release ========================= Before creating a source release, the Release Manager must ensure that any -resolved JIRAs have the appropriate Fix Version set so that the changelog is -generated properly. +resolved GitHub issues have the appropriate milestone set so that the changelog +is generated properly. 
+ +Note that pull requests without a corresponding GitHub issue won't be detected +by the cherry-pick script and must be cherry-picked manually by the release +manager onto the maintenance branch. Examples include MINOR and Dependabot pull +requests. For this reason, it's encouraged to avoid the need for manual +cherry-picking by creating issues for any pull requests that are merged to the +default branch after the release maintenance branch has been created. .. dropdown:: Requirements :animate: fade-in-slide-down @@ -67,7 +77,8 @@ generated properly. Before creating a Release Candidate =================================== -Ensure local tags are removed, gpg-agent is set and JIRA tickets are correctly assigned. +Ensure local tags are removed, gpg-agent is set and GitHub issues are correctly +assigned. .. code-block:: @@ -78,7 +89,8 @@ Ensure local tags are removed, gpg-agent is set and JIRA tickets are correctly a source dev/release/setup-gpg-agent.sh # Curate the release - # The end of the generated report shows the JIRA tickets with wrong version number assigned. + # The end of the generated report shows any GitHub issues with the wrong + # version number assigned. archery release curate Ensure a major version milestone for a follow up release is created on GitHub. This will @@ -149,7 +161,7 @@ Create or update the corresponding maintenance branch # This will create a branch locally called maint-X.Y.Z. # X.Y.Z corresponds with the Major, Minor and Patch version number # of the release respectively. As an example 9.0.0 - archery release --jira-cache /tmp/jiracache cherry-pick X.Y.Z --execute + archery release cherry-pick X.Y.Z --execute # Push the maintenance branch to the remote repository git push -u apache maint-X.Y.Z @@ -158,14 +170,30 @@ Create or update the corresponding maintenance branch .. code-block:: # First run in dry-mode to see which commits will be cherry-picked. - # If there are commits that we don't want to get applied ensure the version on - # JIRA is set to the following release. - archery release --jira-cache /tmp/jiracache cherry-pick X.Y.Z --continue + # If there are commits that we don't want to get applied, ensure the + # milestone on GitHub is set to the following release. + archery release cherry-pick X.Y.Z --continue # Update the maintenance branch with the previous commits - archery release --jira-cache /tmp/jiracache cherry-pick X.Y.Z --continue --execute + archery release cherry-pick X.Y.Z --continue --execute # Push the updated maintenance branch to the remote repository git push -u apache maint-X.Y.Z +Optional: Test Before Creating a Release Candidate +-------------------------------------------------- + +Some release managers prefer to perform testing before creating the first +release candidate to avoid the need to create multiple release candidates within +a given release. 
+ +To test before creating a release candiate: + +* Create a pull request from the up-to-date maint-X.Y.Z branch onto main +* Title the pull request "WIP: Dummy PR to check maint-X.Y.Z status" +* Comment on the pull request to trigger the relevant Crossbow jobs: + + * ``@github-actions crossbow submit --group verify-rc-source`` + * ``@github-actions crossbow submit --group packaging`` + Create the Release Candidate branch from the updated maintenance branch ----------------------------------------------------------------------- @@ -178,12 +206,12 @@ Create the Release Candidate branch from the updated maintenance branch # place the necessary commits updating the version number and then create a git tag # on OSX use gnu-sed with homebrew: brew install gnu-sed (and export to $PATH) # - # starts at 0 and increments every time the Release Candidate is burned + # starts at 0 and increments every time the Release Candidate is created # so for the first RC this would be: dev/release/01-prepare.sh 4.0.0 5.0.0 0 dev/release/01-prepare.sh # Push the release candidate tag - git push -u apache apache-arrow-rc + git push -u apache apache-arrow--rc # Push the release candidate branch in order to trigger verification jobs later git push -u apache release--rc @@ -194,6 +222,7 @@ Build source and binaries and submit them # Build the source release tarball and create Pull Request with verification tasks # + # NOTE: This must be run by a PMC member # NOTE: You need to have GitHub CLI installed to run this script. dev/release/02-source.sh @@ -209,13 +238,16 @@ Build source and binaries and submit them # Sign and upload the binaries # + # NOTE: This must be run by a PMC member + # # On macOS the only way I could get this to work was running "echo "UPDATESTARTUPTTY" | gpg-connect-agent" before running this comment # otherwise I got errors referencing "ioctl" errors. dev/release/05-binary-upload.sh # Sign and upload MATLAB artifacts to the GitHub Releases area. # - # Note that you need to have GitHub CLI installed to run this script. + # NOTE: This must be run by a PMC member + # NOTE: You need to have GitHub CLI installed to run this script. dev/release/06-matlab-upload.sh # Start verifications for binaries and wheels @@ -246,8 +278,6 @@ After the release vote, we must undertake many tasks to update source artifacts, Be sure to go through on the following checklist: #. Update the released milestone Date and set to "Closed" on GitHub -#. Make the CPP PARQUET related version as "RELEASED" on JIRA -#. Start the new version on JIRA for the related CPP PARQUET version #. Merge changes on release branch to maintenance branch for patch releases #. Add the new release to the Apache Reporter System #. Push release tag @@ -266,7 +296,6 @@ Be sure to go through on the following checklist: #. Update vcpkg port #. Update Conan recipe #. Bump versions -#. Update tags for Go modules #. Update docs #. Update version in Apache Arrow Cookbook #. Announce the new release @@ -274,28 +303,6 @@ Be sure to go through on the following checklist: #. Announce the release on Twitter #. Remove old artifacts -.. dropdown:: Mark the released version as "RELEASED" on JIRA - :animate: fade-in-slide-down - :class-title: sd-fs-5 - :class-container: sd-shadow-md - - - Open https://issues.apache.org/jira/plugins/servlet/project-config/ARROW/administer-versions - - Click "..." for the release version in "Actions" column - - Select "Release" - - Set "Release date" - - Click "Release" button - -.. 
dropdown:: Start the new version on JIRA - :animate: fade-in-slide-down - :class-title: sd-fs-5 - :class-container: sd-shadow-md - - - Open https://issues.apache.org/jira/plugins/servlet/project-config/ARROW/administer-versions - - Click "..." for the next version in "Actions" column - - Select "Edit" - - Set "Start date" - - Click "Save" button - .. dropdown:: Merge changes on release branch to maintenance branch for patch releases :animate: fade-in-slide-down :class-title: sd-fs-5 @@ -588,7 +595,7 @@ Be sure to go through on the following checklist: :class-title: sd-fs-5 :class-container: sd-shadow-md - Open a pull request to vcpkg: + Open a pull request to Conan: .. code-block:: Bash @@ -604,8 +611,8 @@ Be sure to go through on the following checklist: git remote add upstream https://github.com/conan-io/conan-center-index.git cd - - # dev/release/post-17-conan.sh 10.0.1 ../conan-center-index - dev/release/post-17-conan.sh X.Y.Z + # dev/release/post-16-conan.sh 10.0.1 ../conan-center-index + dev/release/post-16-conan.sh X.Y.Z This script pushes a ``arrow-X.Y.Z`` branch to your ``conan-io/conan-center-index`` fork. You need to create a pull request from the ``arrow-X.Y.Z`` branch on your Web browser. @@ -627,7 +634,8 @@ Be sure to go through on the following checklist: :class-title: sd-fs-5 :class-container: sd-shadow-md - The documentations are generated in the release process. We just need to upload the generated documentations: + Documentation is generated as part of the release process. We just need to + upload the generated documentation: .. code-block:: Bash @@ -650,7 +658,8 @@ Be sure to go through on the following checklist: :class-title: sd-fs-5 :class-container: sd-shadow-md - TODO + Follow `the documentation `_ + in the Apache Arrow Cookbook repository .. dropdown:: Announce the new release :animate: fade-in-slide-down @@ -666,16 +675,38 @@ Be sure to go through on the following checklist: :class-title: sd-fs-5 :class-container: sd-shadow-md - TODO + The blog post process isn't automated. The rough set of steps we usually take + are: -.. dropdown:: Announce the release on Twitter + * Clone https://github.com/apache/arrow-site. + * Create a new branch off ``main`` for the blog post pull request we're + creating. + * Duplicate a recent blog post entry in the ``_posts`` subfolder and update + the filename and YAML metadata. + + * Set the date in the filename and in the YAML metadata to the date that the + release candidate vote thread for the release closed (in GMT). + + * *For minor releases only*, remove any section about community updates (new + committers, PMC members, etc). + * Update the remainder of the text as needed + * Create the pull request + * In the pull request, ping contributors in each section requesting help + filling in the details for each section. + + +.. dropdown:: Announce the release on social media :animate: fade-in-slide-down :class-title: sd-fs-5 :class-container: sd-shadow-md - Post the release blog post on Twitter from the `@ApacheArrow `_ handle. + Post about the release and link to the blog post on social media. The project + has two official accounts: + + * Twitter/X: `@ApacheArrow `_ + * LinkedIn: https://www.linkedin.com/company/apache-arrow/ - PMC members have access or can request access, after which they can post via `TweetDeck `_. + PMC members have access or can request access to post under these accounts. .. dropdown:: Remove old artifacts :animate: fade-in-slide-down @@ -687,3 +718,5 @@ Be sure to go through on the following checklist: .. 
code-block:: Bash dev/release/post-09-remove-old-artifacts.sh + + Note: This step must be done by a PMC member. From 18d3341f1cbb6a6733e5ebcf8fb4ed14e53280e4 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 8 Jan 2025 14:10:59 +0900 Subject: [PATCH 3/5] GH-45164: [CI][Integration] Follow build script name change in apache/arrow-java (#45199) ### Rationale for this change For apache/arrow-java#493. ### What changes are included in this PR? Remove `java_` prefix. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #45164 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- ci/scripts/integration_arrow_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/integration_arrow_build.sh b/ci/scripts/integration_arrow_build.sh index 47ef34d9a47..1c7e65cf27f 100755 --- a/ci/scripts/integration_arrow_build.sh +++ b/ci/scripts/integration_arrow_build.sh @@ -60,7 +60,7 @@ if [ "${ARCHERY_INTEGRATION_WITH_JAVA}" -gt "0" ]; then export JAVA_JNI_CMAKE_ARGS="-DARROW_JAVA_JNI_ENABLE_DEFAULT=OFF -DARROW_JAVA_JNI_ENABLE_C=ON" ${arrow_dir}/java/ci/scripts/jni_build.sh "${arrow_dir}/java" "${ARROW_HOME}" "${build_dir}/java/" /tmp/dist/java - ${arrow_dir}/java/ci/scripts/java_build.sh "${arrow_dir}/java" "${build_dir}/java" /tmp/dist/java + ${arrow_dir}/java/ci/scripts/build.sh "${arrow_dir}/java" "${build_dir}/java" /tmp/dist/java fi github_actions_group_end From fde843ac5f91605af5cd3a0ad85a9aa31cceb989 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 9 Jan 2025 22:45:49 +0800 Subject: [PATCH 4/5] GH-45212: [C++][Parquet] Fix uninitialized size_statistics_level property (#45213) ### Rationale for this change The PR to introduce SizeStatistics has spanned several months, during which time WriterProperties::Builder(const WriterProperties& properties) was added. ### What changes are included in this PR? This PR fixes WriterProperties::Builder(const WriterProperties& properties) function to initialize size_statistics_level_. ### Are these changes tested? Pass CIs. ### Are there any user-facing changes? No. * GitHub Issue: #45212 Authored-by: Gang Wu Signed-off-by: Gang Wu --- cpp/src/parquet/properties.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index c9420103968..edaf28cd92a 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -271,6 +271,7 @@ class PARQUET_EXPORT WriterProperties { created_by_(properties.created_by()), store_decimal_as_integer_(properties.store_decimal_as_integer()), page_checksum_enabled_(properties.page_checksum_enabled()), + size_statistics_level_(properties.size_statistics_level()), sorting_columns_(properties.sorting_columns()), default_column_properties_(properties.default_column_properties()) {} From fa0b2d77a809ce04c1161335702f142b62c22799 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Jan 2025 20:22:01 +0100 Subject: [PATCH 5/5] GH-43683: [Python] Use pandas StringDtype when enabled (pandas 3+) (#44195) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change With pandas' [PDEP-14](https://pandas.pydata.org/pdeps/0014-string-dtype.html) proposal, pandas is planning to introduce a default string dtype in pandas 3.0 (instead of the current object dtype). This will become the default in pandas 3.0, and can be enabled with an option in the upcoming pandas 2.3 (`pd.options.future.infer_string = True`). 
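As a rough illustration of what that option changes for `to_pandas()` (a minimal sketch, assuming a pandas nightly/2.3+ build where `pd.options.future.infer_string` is available and a PyArrow build that includes this change; the exact dtype repr in the comments is indicative, not guaranteed):

```python
import pandas as pd
import pyarrow as pa

# Opt in to pandas' future default string dtype; on pandas >= 3.0 this is
# the default and the line below is no longer needed.
pd.options.future.infer_string = True

table = pa.table({"col": pa.array(["a", "b", None], pa.string_view())})

# With this change, string, large_string and string_view columns are
# converted to pandas' string dtype instead of object dtype.
print(table.to_pandas().dtypes)

# strings_to_categorical=True still takes precedence and yields a
# Categorical column (string_view support for this keyword is what
# the first patch in this series adds).
print(table.to_pandas(strings_to_categorical=True).dtypes)
```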
To prepare for that, we should start using that string dtype in `to_pandas()` conversions when that option is enabled. ### What changes are included in this PR? - If pandas >= 3.0 is used or the pandas option is enabled, ensure that `to_pandas()` calls use the default string dtype of pandas for string-like columns (string, large_string, string_view) ### Are these changes tested? It is tested in the pandas-nightly crossbow build. There is still one failure that is because of a bug on the pandas side (https://github.com/pandas-dev/pandas/issues/59879) ### Are there any user-facing changes? **This PR includes breaking changes to public APIs.** Depending on the version of pandas, `to_pandas()` will change to use pandas' string dtype instead of object dtype. This is a breaking user-facing change, but essentially just following the equivalent change in default dtype on the pandas side. * GitHub Issue: #43683 Lead-authored-by: Joris Van den Bossche Co-authored-by: Raúl Cumplido Signed-off-by: Joris Van den Bossche --- dev/tasks/tasks.yml | 6 +++ docker-compose.yml | 1 + python/pyarrow/array.pxi | 2 + python/pyarrow/pandas-shim.pxi | 17 +++++- python/pyarrow/pandas_compat.py | 62 +++++++++++++++++++--- python/pyarrow/tests/test_compute.py | 19 +++---- python/pyarrow/tests/test_feather.py | 6 ++- python/pyarrow/tests/test_pandas.py | 77 ++++++++++++++++++++++------ 8 files changed, 155 insertions(+), 35 deletions(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 9f04d33f83c..c43df2b6f25 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1430,6 +1430,12 @@ tasks: # ensure we have at least one build with parquet encryption disabled PARQUET_REQUIRE_ENCRYPTION: "OFF" {% endif %} + {% if pandas_version == "nightly" %} + # TODO can be removed once this is enabled by default in pandas >= 3 + # This is to enable the Pandas feature. 
+ # See: https://github.com/pandas-dev/pandas/pull/58459 + PANDAS_FUTURE_INFER_STRING: "1" + {% endif %} {% if not cache_leaf %} # use the latest pandas release, so prevent reusing any cached layers flags: --no-leaf-cache diff --git a/docker-compose.yml b/docker-compose.yml index bd912095633..b70d924da13 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1326,6 +1326,7 @@ services: PYTEST_ARGS: # inherit HYPOTHESIS_PROFILE: # inherit PYARROW_TEST_HYPOTHESIS: # inherit + PANDAS_FUTURE_INFER_STRING: # inherit volumes: *conda-volumes command: *python-conda-command diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index f86caf1433d..2ef42051d9a 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -117,6 +117,8 @@ def _handle_arrow_array_protocol(obj, type, mask, size): "return a pyarrow Array or ChunkedArray.") if isinstance(res, ChunkedArray) and res.num_chunks==1: res = res.chunk(0) + if type is not None and res.type != type: + res = res.cast(type) return res diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi index 74f0d981b52..5be6f03f86e 100644 --- a/python/pyarrow/pandas-shim.pxi +++ b/python/pyarrow/pandas-shim.pxi @@ -38,7 +38,7 @@ cdef class _PandasAPIShim(object): object _array_like_types, _is_extension_array_dtype, _lock bint has_sparse bint _pd024 - bint _is_v1, _is_ge_v21, _is_ge_v3 + bint _is_v1, _is_ge_v21, _is_ge_v3, _is_ge_v3_strict def __init__(self): self._lock = Lock() @@ -80,6 +80,7 @@ cdef class _PandasAPIShim(object): self._is_v1 = self._loose_version < Version('2.0.0') self._is_ge_v21 = self._loose_version >= Version('2.1.0') self._is_ge_v3 = self._loose_version >= Version('3.0.0.dev0') + self._is_ge_v3_strict = self._loose_version >= Version('3.0.0') self._compat_module = pdcompat self._data_frame = pd.DataFrame @@ -174,6 +175,20 @@ cdef class _PandasAPIShim(object): self._check_import() return self._is_ge_v3 + def is_ge_v3_strict(self): + self._check_import() + return self._is_ge_v3_strict + + def uses_string_dtype(self): + if self.is_ge_v3_strict(): + return True + try: + if self.pd.options.future.infer_string: + return True + except: + pass + return False + @property def categorical_type(self): self._check_import() diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index d0582f825b5..e9655914ad7 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -174,7 +174,11 @@ def get_column_metadata(column, name, arrow_type, field_name): } string_dtype = 'object' - if name is not None and not isinstance(name, str): + if ( + name is not None + and not (isinstance(name, float) and np.isnan(name)) + and not isinstance(name, str) + ): raise TypeError( 'Column name must be a string. 
Got column {} of type {}'.format( name, type(name).__name__ @@ -340,8 +344,8 @@ def _column_name_to_strings(name): return str(tuple(map(_column_name_to_strings, name))) elif isinstance(name, Sequence): raise TypeError("Unsupported type for MultiIndex level") - elif name is None: - return None + elif name is None or (isinstance(name, float) and np.isnan(name)): + return name return str(name) @@ -790,10 +794,12 @@ def table_to_dataframe( table, index = _reconstruct_index(table, index_descriptors, all_columns, types_mapper) ext_columns_dtypes = _get_extension_dtypes( - table, all_columns, types_mapper) + table, all_columns, types_mapper, options, categories) else: index = _pandas_api.pd.RangeIndex(table.num_rows) - ext_columns_dtypes = _get_extension_dtypes(table, [], types_mapper) + ext_columns_dtypes = _get_extension_dtypes( + table, [], types_mapper, options, categories + ) _check_data_column_metadata_consistency(all_columns) columns = _deserialize_column_index(table, all_columns, column_indexes) @@ -838,7 +844,7 @@ def table_to_dataframe( } -def _get_extension_dtypes(table, columns_metadata, types_mapper=None): +def _get_extension_dtypes(table, columns_metadata, types_mapper, options, categories): """ Based on the stored column pandas metadata and the extension types in the arrow schema, infer which columns should be converted to a @@ -851,6 +857,9 @@ def _get_extension_dtypes(table, columns_metadata, types_mapper=None): and then we can check if this dtype supports conversion from arrow. """ + strings_to_categorical = options["strings_to_categorical"] + categories = categories or [] + ext_columns = {} # older pandas version that does not yet support extension dtypes @@ -889,9 +898,32 @@ def _get_extension_dtypes(table, columns_metadata, types_mapper=None): # that are certainly numpy dtypes pandas_dtype = _pandas_api.pandas_dtype(dtype) if isinstance(pandas_dtype, _pandas_api.extension_dtype): + if isinstance(pandas_dtype, _pandas_api.pd.StringDtype): + # when the metadata indicate to use the string dtype, + # ignore this in case: + # - it is specified to convert strings / this column to categorical + # - the column itself is dictionary encoded and would otherwise be + # converted to categorical + if strings_to_categorical or name in categories: + continue + try: + if pa.types.is_dictionary(table.schema.field(name).type): + continue + except KeyError: + pass if hasattr(pandas_dtype, "__from_arrow__"): ext_columns[name] = pandas_dtype + # for pandas 3.0+, use pandas' new default string dtype + if _pandas_api.uses_string_dtype() and not strings_to_categorical: + for field in table.schema: + if field.name not in ext_columns and ( + pa.types.is_string(field.type) + or pa.types.is_large_string(field.type) + or pa.types.is_string_view(field.type) + ) and field.name not in categories: + ext_columns[field.name] = _pandas_api.pd.StringDtype(na_value=np.nan) + return ext_columns @@ -1049,9 +1081,9 @@ def get_pandas_logical_type_map(): 'date': 'datetime64[D]', 'datetime': 'datetime64[ns]', 'datetimetz': 'datetime64[ns]', - 'unicode': np.str_, + 'unicode': 'str', 'bytes': np.bytes_, - 'string': np.str_, + 'string': 'str', 'integer': np.int64, 'floating': np.float64, 'decimal': np.object_, @@ -1142,6 +1174,20 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): # GH-41503: if the column index was decimal, restore to decimal elif pandas_dtype == "decimal": level = _pandas_api.pd.Index([decimal.Decimal(i) for i in level]) + elif ( + level.dtype == "str" and numpy_dtype == "object" + and 
("mixed" in pandas_dtype or pandas_dtype in ["unicode", "string"]) + ): + # the metadata indicate that the original dataframe used object dtype, + # but ignore this and keep string dtype if: + # - the original columns used mixed types -> we don't attempt to faithfully + # roundtrip in this case, but keep the column names as strings + # - the original columns were inferred to be strings but stored in object + # dtype -> we don't restore the object dtype because all metadata + # generated using pandas < 3 will have this case by default, and + # for pandas >= 3 we want to use the default string dtype for .columns + new_levels.append(level) + continue elif level.dtype != dtype: level = level.astype(dtype) # ARROW-9096: if original DataFrame was upcast we keep that diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index e6fcd6149ee..6f28205a18e 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1020,7 +1020,7 @@ def test_replace_slice(): offsets = range(-3, 4) arr = pa.array([None, '', 'a', 'ab', 'abc', 'abcd', 'abcde']) - series = arr.to_pandas() + series = arr.to_pandas().astype(object).replace({np.nan: None}) for start in offsets: for stop in offsets: expected = series.str.slice_replace(start, stop, 'XX') @@ -1031,7 +1031,7 @@ def test_replace_slice(): assert pc.binary_replace_slice(arr, start, stop, 'XX') == actual arr = pa.array([None, '', 'π', 'πb', 'πbθ', 'πbθd', 'πbθde']) - series = arr.to_pandas() + series = arr.to_pandas().astype(object).replace({np.nan: None}) for start in offsets: for stop in offsets: expected = series.str.slice_replace(start, stop, 'XX') @@ -2132,7 +2132,8 @@ def test_strftime(): for fmt in formats: options = pc.StrftimeOptions(fmt) result = pc.strftime(tsa, options=options) - expected = pa.array(ts.strftime(fmt)) + # cast to the same type as result to ignore string vs large_string + expected = pa.array(ts.strftime(fmt)).cast(result.type) assert result.equals(expected) fmt = "%Y-%m-%dT%H:%M:%S" @@ -2140,34 +2141,34 @@ def test_strftime(): # Default format tsa = pa.array(ts, type=pa.timestamp("s", timezone)) result = pc.strftime(tsa, options=pc.StrftimeOptions()) - expected = pa.array(ts.strftime(fmt)) + expected = pa.array(ts.strftime(fmt)).cast(result.type) assert result.equals(expected) # Default format plus timezone tsa = pa.array(ts, type=pa.timestamp("s", timezone)) result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z")) - expected = pa.array(ts.strftime(fmt + "%Z")) + expected = pa.array(ts.strftime(fmt + "%Z")).cast(result.type) assert result.equals(expected) # Pandas %S is equivalent to %S in arrow for unit="s" tsa = pa.array(ts, type=pa.timestamp("s", timezone)) options = pc.StrftimeOptions("%S") result = pc.strftime(tsa, options=options) - expected = pa.array(ts.strftime("%S")) + expected = pa.array(ts.strftime("%S")).cast(result.type) assert result.equals(expected) # Pandas %S.%f is equivalent to %S in arrow for unit="us" tsa = pa.array(ts, type=pa.timestamp("us", timezone)) options = pc.StrftimeOptions("%S") result = pc.strftime(tsa, options=options) - expected = pa.array(ts.strftime("%S.%f")) + expected = pa.array(ts.strftime("%S.%f")).cast(result.type) assert result.equals(expected) # Test setting locale tsa = pa.array(ts, type=pa.timestamp("s", timezone)) options = pc.StrftimeOptions(fmt, locale="C") result = pc.strftime(tsa, options=options) - expected = pa.array(ts.strftime(fmt)) + expected = pa.array(ts.strftime(fmt)).cast(result.type) assert 
result.equals(expected) # Test timestamps without timezone @@ -2175,7 +2176,7 @@ def test_strftime(): ts = pd.to_datetime(times) tsa = pa.array(ts, type=pa.timestamp("s")) result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt)) - expected = pa.array(ts.strftime(fmt)) + expected = pa.array(ts.strftime(fmt)).cast(result.type) # Positional format assert pc.strftime(tsa, fmt) == result diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 18c8cd5b654..249fb621279 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -426,7 +426,11 @@ def test_empty_strings(version): @pytest.mark.pandas def test_all_none(version): df = pd.DataFrame({'all_none': [None] * 10}) - _check_pandas_roundtrip(df, version=version) + if version == 1 and pa.pandas_compat._pandas_api.uses_string_dtype(): + expected = df.astype("str") + else: + expected = df + _check_pandas_roundtrip(df, version=version, expected=expected) @pytest.mark.pandas diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index d5c936df072..f356874c576 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -349,6 +349,17 @@ def test_integer_index_column(self): df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')]) _check_pandas_roundtrip(df, preserve_index=True) + def test_float_column_index_with_missing(self): + df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=[1.5, np.nan]) + _check_pandas_roundtrip(df, preserve_index=True) + + @pytest.mark.filterwarnings( + "ignore:The DataFrame has column names of mixed type:UserWarning" + ) + def test_string_column_index_with_missing(self): + df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=["A", None]) + _check_pandas_roundtrip(df, preserve_index=True) + def test_index_metadata_field_name(self): # test None case, and strangely named non-index columns df = pd.DataFrame( @@ -359,8 +370,11 @@ def test_index_metadata_field_name(self): ), columns=['a', None, '__index_level_0__'], ) - with pytest.warns(UserWarning): + if _pandas_api.uses_string_dtype(): t = pa.Table.from_pandas(df, preserve_index=True) + else: + with pytest.warns(UserWarning): + t = pa.Table.from_pandas(df, preserve_index=True) js = t.schema.pandas_metadata col1, col2, col3, idx0, foo = js['columns'] @@ -368,8 +382,12 @@ def test_index_metadata_field_name(self): assert col1['name'] == 'a' assert col1['name'] == col1['field_name'] - assert col2['name'] is None - assert col2['field_name'] == 'None' + if _pandas_api.uses_string_dtype(): + assert np.isnan(col2['name']) + assert col2['field_name'] == 'nan' + else: + assert col2['name'] is None + assert col2['field_name'] == 'None' assert col3['name'] == '__index_level_0__' assert col3['name'] == col3['field_name'] @@ -411,7 +429,9 @@ def test_string_column_index(self): column_indexes, = js['column_indexes'] assert column_indexes['name'] == 'stringz' assert column_indexes['name'] == column_indexes['field_name'] - assert column_indexes['numpy_type'] == 'object' + assert column_indexes['numpy_type'] == ( + 'str' if _pandas_api.uses_string_dtype() else 'object' + ) assert column_indexes['pandas_type'] == 'unicode' md = column_indexes['metadata'] @@ -1680,7 +1700,10 @@ def test_pandas_unicode(self): repeats = 1000 values = ['foo', None, 'bar', 'mañana', np.nan] df = pd.DataFrame({'strings': values * repeats}) - field = pa.field('strings', pa.string()) + field = pa.field( + 'strings', + pa.large_string() if _pandas_api.uses_string_dtype() else 
pa.string() + ) schema = pa.schema([field]) ex_values = ['foo', None, 'bar', 'mañana', None] expected = pd.DataFrame({'strings': ex_values * repeats}) @@ -3323,6 +3346,10 @@ def _assert_nunique(obj, expected): def test_to_pandas_deduplicate_strings_array_types(): + if _pandas_api.uses_string_dtype(): + pytest.skip( + "pandas uses string dtype and not object dtype, keyword has no effect" + ) nunique = 100 repeats = 10 values = _generate_dedup_example(nunique, repeats) @@ -3335,6 +3362,10 @@ def test_to_pandas_deduplicate_strings_array_types(): def test_to_pandas_deduplicate_strings_table_types(): + if _pandas_api.uses_string_dtype(): + pytest.skip( + "pandas uses string dtype and not object dtype, keyword has no effect" + ) nunique = 100 repeats = 10 values = _generate_dedup_example(nunique, repeats) @@ -3798,20 +3829,26 @@ def _check_to_pandas_memory_unchanged(obj, **kwargs): x = obj.to_pandas(**kwargs) # noqa # Memory allocation unchanged -- either zero copy or self-destructing - assert pa.total_allocated_bytes() == prior_allocation + if _pandas_api.uses_string_dtype(): + # for the string array of the columns Index + # -> increase the size to account for overallocation for small arrays + max_index_allocation = max(192, x.columns.nbytes * 2) + assert pa.total_allocated_bytes() <= (prior_allocation + max_index_allocation) + else: + assert pa.total_allocated_bytes() == prior_allocation def test_to_pandas_split_blocks(): # ARROW-3789 t = pa.table([ - pa.array([1, 2, 3, 4, 5], type='i1'), - pa.array([1, 2, 3, 4, 5], type='i4'), - pa.array([1, 2, 3, 4, 5], type='i8'), - pa.array([1, 2, 3, 4, 5], type='f4'), - pa.array([1, 2, 3, 4, 5], type='f8'), - pa.array([1, 2, 3, 4, 5], type='f8'), - pa.array([1, 2, 3, 4, 5], type='f8'), - pa.array([1, 2, 3, 4, 5], type='f8'), + pa.array([1, 2, 3, 4, 5]*100, type='i1'), + pa.array([1, 2, 3, 4, 5]*100, type='i4'), + pa.array([1, 2, 3, 4, 5]*100, type='i8'), + pa.array([1, 2, 3, 4, 5]*100, type='f4'), + pa.array([1, 2, 3, 4, 5]*100, type='f8'), + pa.array([1, 2, 3, 4, 5]*100, type='f8'), + pa.array([1, 2, 3, 4, 5]*100, type='f8'), + pa.array([1, 2, 3, 4, 5]*100, type='f8'), ], ['f{}'.format(i) for i in range(8)]) _check_blocks_created(t, 8) @@ -3856,7 +3893,12 @@ def test_table_uses_memory_pool(): prior_allocation = pa.total_allocated_bytes() x = t.to_pandas() - assert pa.total_allocated_bytes() == (prior_allocation + 3 * N * 8) + new_allocation = 3 * N * 8 + if _pandas_api.uses_string_dtype(): + # for the small columns Index + new_allocation += 128 + + assert pa.total_allocated_bytes() == (prior_allocation + new_allocation) # Check successful garbage collection x = None # noqa @@ -4134,7 +4176,10 @@ def test_dictionary_encoded_nested_to_pandas(): def test_dictionary_from_pandas(): cat = pd.Categorical(['a', 'b', 'a']) - expected_type = pa.dictionary(pa.int8(), pa.string()) + expected_type = pa.dictionary( + pa.int8(), + pa.large_string() if _pandas_api.uses_string_dtype() else pa.string() + ) result = pa.array(cat) assert result.to_pylist() == ['a', 'b', 'a']
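To make the dictionary-type change exercised by `test_dictionary_from_pandas` concrete, here is a small sketch; the value type of the dictionary depends on whether pandas' string dtype is enabled, so the types named in the comments are indicative rather than guaranteed:

```python
import pandas as pd
import pyarrow as pa

cat = pd.Categorical(["a", "b", "a"])
arr = pa.array(cat)

# With object-dtype categories this has been dictionary<values=string, indices=int8>;
# with pandas' future string dtype enabled the categories arrive as large_string,
# hence dictionary<values=large_string, indices=int8>.
print(arr.type)

# Decoding the dictionary still round-trips the original values.
assert arr.to_pylist() == ["a", "b", "a"]
```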