diff --git a/ci/scripts/integration_arrow_build.sh b/ci/scripts/integration_arrow_build.sh index 47ef34d9a47..1c7e65cf27f 100755 --- a/ci/scripts/integration_arrow_build.sh +++ b/ci/scripts/integration_arrow_build.sh @@ -60,7 +60,7 @@ if [ "${ARCHERY_INTEGRATION_WITH_JAVA}" -gt "0" ]; then export JAVA_JNI_CMAKE_ARGS="-DARROW_JAVA_JNI_ENABLE_DEFAULT=OFF -DARROW_JAVA_JNI_ENABLE_C=ON" ${arrow_dir}/java/ci/scripts/jni_build.sh "${arrow_dir}/java" "${ARROW_HOME}" "${build_dir}/java/" /tmp/dist/java - ${arrow_dir}/java/ci/scripts/java_build.sh "${arrow_dir}/java" "${build_dir}/java" /tmp/dist/java + ${arrow_dir}/java/ci/scripts/build.sh "${arrow_dir}/java" "${build_dir}/java" /tmp/dist/java fi github_actions_group_end diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index c9420103968..edaf28cd92a 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -271,6 +271,7 @@ class PARQUET_EXPORT WriterProperties { created_by_(properties.created_by()), store_decimal_as_integer_(properties.store_decimal_as_integer()), page_checksum_enabled_(properties.page_checksum_enabled()), + size_statistics_level_(properties.size_statistics_level()), sorting_columns_(properties.sorting_columns()), default_column_properties_(properties.default_column_properties()) {} diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 9f04d33f83c..c43df2b6f25 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1430,6 +1430,12 @@ tasks: # ensure we have at least one build with parquet encryption disabled PARQUET_REQUIRE_ENCRYPTION: "OFF" {% endif %} + {% if pandas_version == "nightly" %} + # TODO can be removed once this is enabled by default in pandas >= 3 + # This is to enable the Pandas feature. + # See: https://github.com/pandas-dev/pandas/pull/58459 + PANDAS_FUTURE_INFER_STRING: "1" + {% endif %} {% if not cache_leaf %} # use the latest pandas release, so prevent reusing any cached layers flags: --no-leaf-cache diff --git a/docker-compose.yml b/docker-compose.yml index bd912095633..b70d924da13 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1326,6 +1326,7 @@ services: PYTEST_ARGS: # inherit HYPOTHESIS_PROFILE: # inherit PYARROW_TEST_HYPOTHESIS: # inherit + PANDAS_FUTURE_INFER_STRING: # inherit volumes: *conda-volumes command: *python-conda-command diff --git a/docs/source/developers/release.rst b/docs/source/developers/release.rst index 52f4a751dcc..605a1adbe10 100644 --- a/docs/source/developers/release.rst +++ b/docs/source/developers/release.rst @@ -24,7 +24,10 @@ Release Management Guide This page provides detailed information on the steps followed to perform a release. It can be used both as a guide to learn the Apache Arrow release process and as a comprehensive checklist for the Release Manager when -performing a release. +performing a release. The person acting as Release Manager must at least have +committer status in order to perform the tasks below. If the Release Manager is +a committer but not a member of the PMC, some tasks will need to be delegated +to a PMC member and these are marked below accordingly. Principles ========== @@ -36,8 +39,15 @@ Preparing for the release ========================= Before creating a source release, the Release Manager must ensure that any -resolved JIRAs have the appropriate Fix Version set so that the changelog is -generated properly. +resolved GitHub issues have the appropriate milestone set so that the changelog +is generated properly. 
+ +Note that pull requests without a corresponding GitHub issue won't be detected +by the cherry-pick script and must be cherry-picked manually by the release +manager onto the maintenance branch. Examples include MINOR and Dependabot pull +requests. For this reason, it's encouraged to avoid the need for manual +cherry-picking by creating issues for any pull requests that are merged to the +default branch after the release maintenance branch has been created. .. dropdown:: Requirements :animate: fade-in-slide-down @@ -67,7 +77,8 @@ generated properly. Before creating a Release Candidate =================================== -Ensure local tags are removed, gpg-agent is set and JIRA tickets are correctly assigned. +Ensure local tags are removed, gpg-agent is set and GitHub issues are correctly +assigned. .. code-block:: @@ -78,7 +89,8 @@ Ensure local tags are removed, gpg-agent is set and JIRA tickets are correctly a source dev/release/setup-gpg-agent.sh # Curate the release - # The end of the generated report shows the JIRA tickets with wrong version number assigned. + # The end of the generated report shows any GitHub issues with the wrong + # version number assigned. archery release curate Ensure a major version milestone for a follow up release is created on GitHub. This will @@ -149,7 +161,7 @@ Create or update the corresponding maintenance branch # This will create a branch locally called maint-X.Y.Z. # X.Y.Z corresponds with the Major, Minor and Patch version number # of the release respectively. As an example 9.0.0 - archery release --jira-cache /tmp/jiracache cherry-pick X.Y.Z --execute + archery release cherry-pick X.Y.Z --execute # Push the maintenance branch to the remote repository git push -u apache maint-X.Y.Z @@ -158,14 +170,30 @@ Create or update the corresponding maintenance branch .. code-block:: # First run in dry-mode to see which commits will be cherry-picked. - # If there are commits that we don't want to get applied ensure the version on - # JIRA is set to the following release. - archery release --jira-cache /tmp/jiracache cherry-pick X.Y.Z --continue + # If there are commits that we don't want to get applied, ensure the + # milestone on GitHub is set to the following release. + archery release cherry-pick X.Y.Z --continue # Update the maintenance branch with the previous commits - archery release --jira-cache /tmp/jiracache cherry-pick X.Y.Z --continue --execute + archery release cherry-pick X.Y.Z --continue --execute # Push the updated maintenance branch to the remote repository git push -u apache maint-X.Y.Z +Optional: Test Before Creating a Release Candidate +-------------------------------------------------- + +Some release managers prefer to perform testing before creating the first +release candidate to avoid the need to create multiple release candidates within +a given release. 
+
+To test before creating a release candidate:
+
+* Create a pull request from the up-to-date maint-X.Y.Z branch onto main
+* Title the pull request "WIP: Dummy PR to check maint-X.Y.Z status"
+* Comment on the pull request to trigger the relevant Crossbow jobs:
+
+  * ``@github-actions crossbow submit --group verify-rc-source``
+  * ``@github-actions crossbow submit --group packaging``
+
 
 Create the Release Candidate branch from the updated maintenance branch
 ------------------------------------------------------------------------
@@ -178,12 +206,12 @@ Create the Release Candidate branch from the updated maintenance branch
    # place the necessary commits updating the version number and then create a git tag
    # on OSX use gnu-sed with homebrew: brew install gnu-sed (and export to $PATH)
    #
-   # starts at 0 and increments every time the Release Candidate is burned
+   # starts at 0 and increments every time the Release Candidate is created
    # so for the first RC this would be: dev/release/01-prepare.sh 4.0.0 5.0.0 0
    dev/release/01-prepare.sh
 
    # Push the release candidate tag
-   git push -u apache apache-arrow-rc
+   git push -u apache apache-arrow--rc
    # Push the release candidate branch in order to trigger verification jobs later
    git push -u apache release--rc
 
@@ -194,6 +222,7 @@ Build source and binaries and submit them
 
    # Build the source release tarball and create Pull Request with verification tasks
    #
+   # NOTE: This must be run by a PMC member
    # NOTE: You need to have GitHub CLI installed to run this script.
    dev/release/02-source.sh
 
@@ -209,13 +238,16 @@ Build source and binaries and submit them
 
    # Sign and upload the binaries
    #
+   # NOTE: This must be run by a PMC member
+   #
    # On macOS the only way I could get this to work was running "echo "UPDATESTARTUPTTY" | gpg-connect-agent" before running this comment
    # otherwise I got errors referencing "ioctl" errors.
    dev/release/05-binary-upload.sh
 
    # Sign and upload MATLAB artifacts to the GitHub Releases area.
    #
-   # Note that you need to have GitHub CLI installed to run this script.
+   # NOTE: This must be run by a PMC member
+   # NOTE: You need to have GitHub CLI installed to run this script.
    dev/release/06-matlab-upload.sh
 
    # Start verifications for binaries and wheels
@@ -246,8 +278,6 @@ After the release vote, we must undertake many tasks to update source artifacts,
 Be sure to go through on the following checklist:
 
 #. Update the released milestone Date and set to "Closed" on GitHub
-#. Make the CPP PARQUET related version as "RELEASED" on JIRA
-#. Start the new version on JIRA for the related CPP PARQUET version
 #. Merge changes on release branch to maintenance branch for patch releases
 #. Add the new release to the Apache Reporter System
 #. Push release tag
@@ -266,7 +296,6 @@ Be sure to go through on the following checklist:
 #. Update vcpkg port
 #. Update Conan recipe
 #. Bump versions
-#. Update tags for Go modules
 #. Update docs
 #. Update version in Apache Arrow Cookbook
 #. Announce the new release
@@ -274,28 +303,6 @@ Be sure to go through on the following checklist:
 #. Announce the release on Twitter
 #. Remove old artifacts
 
-.. dropdown:: Mark the released version as "RELEASED" on JIRA
-   :animate: fade-in-slide-down
-   :class-title: sd-fs-5
-   :class-container: sd-shadow-md
-
-   - Open https://issues.apache.org/jira/plugins/servlet/project-config/ARROW/administer-versions
-   - Click "..." for the release version in "Actions" column
-   - Select "Release"
-   - Set "Release date"
-   - Click "Release" button
-
-..
dropdown:: Start the new version on JIRA - :animate: fade-in-slide-down - :class-title: sd-fs-5 - :class-container: sd-shadow-md - - - Open https://issues.apache.org/jira/plugins/servlet/project-config/ARROW/administer-versions - - Click "..." for the next version in "Actions" column - - Select "Edit" - - Set "Start date" - - Click "Save" button - .. dropdown:: Merge changes on release branch to maintenance branch for patch releases :animate: fade-in-slide-down :class-title: sd-fs-5 @@ -588,7 +595,7 @@ Be sure to go through on the following checklist: :class-title: sd-fs-5 :class-container: sd-shadow-md - Open a pull request to vcpkg: + Open a pull request to Conan: .. code-block:: Bash @@ -604,8 +611,8 @@ Be sure to go through on the following checklist: git remote add upstream https://github.com/conan-io/conan-center-index.git cd - - # dev/release/post-17-conan.sh 10.0.1 ../conan-center-index - dev/release/post-17-conan.sh X.Y.Z + # dev/release/post-16-conan.sh 10.0.1 ../conan-center-index + dev/release/post-16-conan.sh X.Y.Z This script pushes a ``arrow-X.Y.Z`` branch to your ``conan-io/conan-center-index`` fork. You need to create a pull request from the ``arrow-X.Y.Z`` branch on your Web browser. @@ -627,7 +634,8 @@ Be sure to go through on the following checklist: :class-title: sd-fs-5 :class-container: sd-shadow-md - The documentations are generated in the release process. We just need to upload the generated documentations: + Documentation is generated as part of the release process. We just need to + upload the generated documentation: .. code-block:: Bash @@ -650,7 +658,8 @@ Be sure to go through on the following checklist: :class-title: sd-fs-5 :class-container: sd-shadow-md - TODO + Follow `the documentation `_ + in the Apache Arrow Cookbook repository .. dropdown:: Announce the new release :animate: fade-in-slide-down @@ -666,16 +675,38 @@ Be sure to go through on the following checklist: :class-title: sd-fs-5 :class-container: sd-shadow-md - TODO + The blog post process isn't automated. The rough set of steps we usually take + are: -.. dropdown:: Announce the release on Twitter + * Clone https://github.com/apache/arrow-site. + * Create a new branch off ``main`` for the blog post pull request we're + creating. + * Duplicate a recent blog post entry in the ``_posts`` subfolder and update + the filename and YAML metadata. + + * Set the date in the filename and in the YAML metadata to the date that the + release candidate vote thread for the release closed (in GMT). + + * *For minor releases only*, remove any section about community updates (new + committers, PMC members, etc). + * Update the remainder of the text as needed + * Create the pull request + * In the pull request, ping contributors in each section requesting help + filling in the details for each section. + + +.. dropdown:: Announce the release on social media :animate: fade-in-slide-down :class-title: sd-fs-5 :class-container: sd-shadow-md - Post the release blog post on Twitter from the `@ApacheArrow `_ handle. + Post about the release and link to the blog post on social media. The project + has two official accounts: + + * Twitter/X: `@ApacheArrow `_ + * LinkedIn: https://www.linkedin.com/company/apache-arrow/ - PMC members have access or can request access, after which they can post via `TweetDeck `_. + PMC members have access or can request access to post under these accounts. .. dropdown:: Remove old artifacts :animate: fade-in-slide-down @@ -687,3 +718,5 @@ Be sure to go through on the following checklist: .. 
code-block:: Bash dev/release/post-09-remove-old-artifacts.sh + + Note: This step must be done by a PMC member. diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index f86caf1433d..2ef42051d9a 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -117,6 +117,8 @@ def _handle_arrow_array_protocol(obj, type, mask, size): "return a pyarrow Array or ChunkedArray.") if isinstance(res, ChunkedArray) and res.num_chunks==1: res = res.chunk(0) + if type is not None and res.type != type: + res = res.cast(type) return res diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi index 74f0d981b52..5be6f03f86e 100644 --- a/python/pyarrow/pandas-shim.pxi +++ b/python/pyarrow/pandas-shim.pxi @@ -38,7 +38,7 @@ cdef class _PandasAPIShim(object): object _array_like_types, _is_extension_array_dtype, _lock bint has_sparse bint _pd024 - bint _is_v1, _is_ge_v21, _is_ge_v3 + bint _is_v1, _is_ge_v21, _is_ge_v3, _is_ge_v3_strict def __init__(self): self._lock = Lock() @@ -80,6 +80,7 @@ cdef class _PandasAPIShim(object): self._is_v1 = self._loose_version < Version('2.0.0') self._is_ge_v21 = self._loose_version >= Version('2.1.0') self._is_ge_v3 = self._loose_version >= Version('3.0.0.dev0') + self._is_ge_v3_strict = self._loose_version >= Version('3.0.0') self._compat_module = pdcompat self._data_frame = pd.DataFrame @@ -174,6 +175,20 @@ cdef class _PandasAPIShim(object): self._check_import() return self._is_ge_v3 + def is_ge_v3_strict(self): + self._check_import() + return self._is_ge_v3_strict + + def uses_string_dtype(self): + if self.is_ge_v3_strict(): + return True + try: + if self.pd.options.future.infer_string: + return True + except: + pass + return False + @property def categorical_type(self): self._check_import() diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index d0582f825b5..e9655914ad7 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -174,7 +174,11 @@ def get_column_metadata(column, name, arrow_type, field_name): } string_dtype = 'object' - if name is not None and not isinstance(name, str): + if ( + name is not None + and not (isinstance(name, float) and np.isnan(name)) + and not isinstance(name, str) + ): raise TypeError( 'Column name must be a string. 
Got column {} of type {}'.format( name, type(name).__name__ @@ -340,8 +344,8 @@ def _column_name_to_strings(name): return str(tuple(map(_column_name_to_strings, name))) elif isinstance(name, Sequence): raise TypeError("Unsupported type for MultiIndex level") - elif name is None: - return None + elif name is None or (isinstance(name, float) and np.isnan(name)): + return name return str(name) @@ -790,10 +794,12 @@ def table_to_dataframe( table, index = _reconstruct_index(table, index_descriptors, all_columns, types_mapper) ext_columns_dtypes = _get_extension_dtypes( - table, all_columns, types_mapper) + table, all_columns, types_mapper, options, categories) else: index = _pandas_api.pd.RangeIndex(table.num_rows) - ext_columns_dtypes = _get_extension_dtypes(table, [], types_mapper) + ext_columns_dtypes = _get_extension_dtypes( + table, [], types_mapper, options, categories + ) _check_data_column_metadata_consistency(all_columns) columns = _deserialize_column_index(table, all_columns, column_indexes) @@ -838,7 +844,7 @@ def table_to_dataframe( } -def _get_extension_dtypes(table, columns_metadata, types_mapper=None): +def _get_extension_dtypes(table, columns_metadata, types_mapper, options, categories): """ Based on the stored column pandas metadata and the extension types in the arrow schema, infer which columns should be converted to a @@ -851,6 +857,9 @@ def _get_extension_dtypes(table, columns_metadata, types_mapper=None): and then we can check if this dtype supports conversion from arrow. """ + strings_to_categorical = options["strings_to_categorical"] + categories = categories or [] + ext_columns = {} # older pandas version that does not yet support extension dtypes @@ -889,9 +898,32 @@ def _get_extension_dtypes(table, columns_metadata, types_mapper=None): # that are certainly numpy dtypes pandas_dtype = _pandas_api.pandas_dtype(dtype) if isinstance(pandas_dtype, _pandas_api.extension_dtype): + if isinstance(pandas_dtype, _pandas_api.pd.StringDtype): + # when the metadata indicate to use the string dtype, + # ignore this in case: + # - it is specified to convert strings / this column to categorical + # - the column itself is dictionary encoded and would otherwise be + # converted to categorical + if strings_to_categorical or name in categories: + continue + try: + if pa.types.is_dictionary(table.schema.field(name).type): + continue + except KeyError: + pass if hasattr(pandas_dtype, "__from_arrow__"): ext_columns[name] = pandas_dtype + # for pandas 3.0+, use pandas' new default string dtype + if _pandas_api.uses_string_dtype() and not strings_to_categorical: + for field in table.schema: + if field.name not in ext_columns and ( + pa.types.is_string(field.type) + or pa.types.is_large_string(field.type) + or pa.types.is_string_view(field.type) + ) and field.name not in categories: + ext_columns[field.name] = _pandas_api.pd.StringDtype(na_value=np.nan) + return ext_columns @@ -1049,9 +1081,9 @@ def get_pandas_logical_type_map(): 'date': 'datetime64[D]', 'datetime': 'datetime64[ns]', 'datetimetz': 'datetime64[ns]', - 'unicode': np.str_, + 'unicode': 'str', 'bytes': np.bytes_, - 'string': np.str_, + 'string': 'str', 'integer': np.int64, 'floating': np.float64, 'decimal': np.object_, @@ -1142,6 +1174,20 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): # GH-41503: if the column index was decimal, restore to decimal elif pandas_dtype == "decimal": level = _pandas_api.pd.Index([decimal.Decimal(i) for i in level]) + elif ( + level.dtype == "str" and numpy_dtype == "object" + and 
("mixed" in pandas_dtype or pandas_dtype in ["unicode", "string"]) + ): + # the metadata indicate that the original dataframe used object dtype, + # but ignore this and keep string dtype if: + # - the original columns used mixed types -> we don't attempt to faithfully + # roundtrip in this case, but keep the column names as strings + # - the original columns were inferred to be strings but stored in object + # dtype -> we don't restore the object dtype because all metadata + # generated using pandas < 3 will have this case by default, and + # for pandas >= 3 we want to use the default string dtype for .columns + new_levels.append(level) + continue elif level.dtype != dtype: level = level.astype(dtype) # ARROW-9096: if original DataFrame was upcast we keep that diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 10c4d0e1600..a0f1d5bbbed 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -2523,7 +2523,8 @@ Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arr } if (options.strings_to_categorical) { for (int i = 0; i < static_cast(arrays->size()); i++) { - if (is_base_binary_like((*arrays)[i]->type()->id())) { + if (is_base_binary_like((*arrays)[i]->type()->id()) || + is_binary_view_like((*arrays)[i]->type()->id())) { columns_to_encode.push_back(i); } } @@ -2557,7 +2558,8 @@ Status ConvertChunkedArrayToPandas(const PandasOptions& options, py_ref = nullptr; } - if (options.strings_to_categorical && is_base_binary_like(arr->type()->id())) { + if (options.strings_to_categorical && (is_base_binary_like(arr->type()->id()) || + is_binary_view_like(arr->type()->id()))) { if (options.zero_copy_only) { return Status::Invalid("Need to dictionary encode a column, but ", "only zero-copy conversions allowed"); diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index e6fcd6149ee..6f28205a18e 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1020,7 +1020,7 @@ def test_replace_slice(): offsets = range(-3, 4) arr = pa.array([None, '', 'a', 'ab', 'abc', 'abcd', 'abcde']) - series = arr.to_pandas() + series = arr.to_pandas().astype(object).replace({np.nan: None}) for start in offsets: for stop in offsets: expected = series.str.slice_replace(start, stop, 'XX') @@ -1031,7 +1031,7 @@ def test_replace_slice(): assert pc.binary_replace_slice(arr, start, stop, 'XX') == actual arr = pa.array([None, '', 'π', 'πb', 'πbθ', 'πbθd', 'πbθde']) - series = arr.to_pandas() + series = arr.to_pandas().astype(object).replace({np.nan: None}) for start in offsets: for stop in offsets: expected = series.str.slice_replace(start, stop, 'XX') @@ -2132,7 +2132,8 @@ def test_strftime(): for fmt in formats: options = pc.StrftimeOptions(fmt) result = pc.strftime(tsa, options=options) - expected = pa.array(ts.strftime(fmt)) + # cast to the same type as result to ignore string vs large_string + expected = pa.array(ts.strftime(fmt)).cast(result.type) assert result.equals(expected) fmt = "%Y-%m-%dT%H:%M:%S" @@ -2140,34 +2141,34 @@ def test_strftime(): # Default format tsa = pa.array(ts, type=pa.timestamp("s", timezone)) result = pc.strftime(tsa, options=pc.StrftimeOptions()) - expected = pa.array(ts.strftime(fmt)) + expected = pa.array(ts.strftime(fmt)).cast(result.type) assert result.equals(expected) # Default format plus timezone tsa = pa.array(ts, type=pa.timestamp("s", timezone)) result = 
pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z")) - expected = pa.array(ts.strftime(fmt + "%Z")) + expected = pa.array(ts.strftime(fmt + "%Z")).cast(result.type) assert result.equals(expected) # Pandas %S is equivalent to %S in arrow for unit="s" tsa = pa.array(ts, type=pa.timestamp("s", timezone)) options = pc.StrftimeOptions("%S") result = pc.strftime(tsa, options=options) - expected = pa.array(ts.strftime("%S")) + expected = pa.array(ts.strftime("%S")).cast(result.type) assert result.equals(expected) # Pandas %S.%f is equivalent to %S in arrow for unit="us" tsa = pa.array(ts, type=pa.timestamp("us", timezone)) options = pc.StrftimeOptions("%S") result = pc.strftime(tsa, options=options) - expected = pa.array(ts.strftime("%S.%f")) + expected = pa.array(ts.strftime("%S.%f")).cast(result.type) assert result.equals(expected) # Test setting locale tsa = pa.array(ts, type=pa.timestamp("s", timezone)) options = pc.StrftimeOptions(fmt, locale="C") result = pc.strftime(tsa, options=options) - expected = pa.array(ts.strftime(fmt)) + expected = pa.array(ts.strftime(fmt)).cast(result.type) assert result.equals(expected) # Test timestamps without timezone @@ -2175,7 +2176,7 @@ def test_strftime(): ts = pd.to_datetime(times) tsa = pa.array(ts, type=pa.timestamp("s")) result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt)) - expected = pa.array(ts.strftime(fmt)) + expected = pa.array(ts.strftime(fmt)).cast(result.type) # Positional format assert pc.strftime(tsa, fmt) == result diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 18c8cd5b654..249fb621279 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -426,7 +426,11 @@ def test_empty_strings(version): @pytest.mark.pandas def test_all_none(version): df = pd.DataFrame({'all_none': [None] * 10}) - _check_pandas_roundtrip(df, version=version) + if version == 1 and pa.pandas_compat._pandas_api.uses_string_dtype(): + expected = df.astype("str") + else: + expected = df + _check_pandas_roundtrip(df, version=version, expected=expected) @pytest.mark.pandas diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 1186f87b032..f356874c576 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -349,6 +349,17 @@ def test_integer_index_column(self): df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')]) _check_pandas_roundtrip(df, preserve_index=True) + def test_float_column_index_with_missing(self): + df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=[1.5, np.nan]) + _check_pandas_roundtrip(df, preserve_index=True) + + @pytest.mark.filterwarnings( + "ignore:The DataFrame has column names of mixed type:UserWarning" + ) + def test_string_column_index_with_missing(self): + df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=["A", None]) + _check_pandas_roundtrip(df, preserve_index=True) + def test_index_metadata_field_name(self): # test None case, and strangely named non-index columns df = pd.DataFrame( @@ -359,8 +370,11 @@ def test_index_metadata_field_name(self): ), columns=['a', None, '__index_level_0__'], ) - with pytest.warns(UserWarning): + if _pandas_api.uses_string_dtype(): t = pa.Table.from_pandas(df, preserve_index=True) + else: + with pytest.warns(UserWarning): + t = pa.Table.from_pandas(df, preserve_index=True) js = t.schema.pandas_metadata col1, col2, col3, idx0, foo = js['columns'] @@ -368,8 +382,12 @@ def test_index_metadata_field_name(self): assert col1['name'] == 'a' 
assert col1['name'] == col1['field_name'] - assert col2['name'] is None - assert col2['field_name'] == 'None' + if _pandas_api.uses_string_dtype(): + assert np.isnan(col2['name']) + assert col2['field_name'] == 'nan' + else: + assert col2['name'] is None + assert col2['field_name'] == 'None' assert col3['name'] == '__index_level_0__' assert col3['name'] == col3['field_name'] @@ -411,7 +429,9 @@ def test_string_column_index(self): column_indexes, = js['column_indexes'] assert column_indexes['name'] == 'stringz' assert column_indexes['name'] == column_indexes['field_name'] - assert column_indexes['numpy_type'] == 'object' + assert column_indexes['numpy_type'] == ( + 'str' if _pandas_api.uses_string_dtype() else 'object' + ) assert column_indexes['pandas_type'] == 'unicode' md = column_indexes['metadata'] @@ -1680,7 +1700,10 @@ def test_pandas_unicode(self): repeats = 1000 values = ['foo', None, 'bar', 'mañana', np.nan] df = pd.DataFrame({'strings': values * repeats}) - field = pa.field('strings', pa.string()) + field = pa.field( + 'strings', + pa.large_string() if _pandas_api.uses_string_dtype() else pa.string() + ) schema = pa.schema([field]) ex_values = ['foo', None, 'bar', 'mañana', None] expected = pd.DataFrame({'strings': ex_values * repeats}) @@ -1836,10 +1859,13 @@ def test_to_pandas_categories_already_dictionary(self): result = table.to_pandas(categories=['col']) assert table.to_pandas().equals(result) - def test_table_str_to_categorical_without_na(self): + @pytest.mark.parametrize( + "string_type", [pa.string(), pa.large_string(), pa.string_view()] + ) + def test_table_str_to_categorical_without_na(self, string_type): values = ['a', 'a', 'b', 'b', 'c'] df = pd.DataFrame({'strings': values}) - field = pa.field('strings', pa.string()) + field = pa.field('strings', string_type) schema = pa.schema([field]) table = pa.Table.from_pandas(df, schema=schema) @@ -1851,10 +1877,22 @@ def test_table_str_to_categorical_without_na(self): table.to_pandas(strings_to_categorical=True, zero_copy_only=True) - def test_table_str_to_categorical_with_na(self): + # chunked array + result = table["strings"].to_pandas(strings_to_categorical=True) + expected = pd.Series(pd.Categorical(values), name="strings") + tm.assert_series_equal(result, expected) + + with pytest.raises(pa.ArrowInvalid): + table["strings"].to_pandas(strings_to_categorical=True, + zero_copy_only=True) + + @pytest.mark.parametrize( + "string_type", [pa.string(), pa.large_string(), pa.string_view()] + ) + def test_table_str_to_categorical_with_na(self, string_type): values = [None, 'a', 'b', np.nan] df = pd.DataFrame({'strings': values}) - field = pa.field('strings', pa.string()) + field = pa.field('strings', string_type) schema = pa.schema([field]) table = pa.Table.from_pandas(df, schema=schema) @@ -1866,6 +1904,15 @@ def test_table_str_to_categorical_with_na(self): table.to_pandas(strings_to_categorical=True, zero_copy_only=True) + # chunked array + result = table["strings"].to_pandas(strings_to_categorical=True) + expected = pd.Series(pd.Categorical(values), name="strings") + tm.assert_series_equal(result, expected) + + with pytest.raises(pa.ArrowInvalid): + table["strings"].to_pandas(strings_to_categorical=True, + zero_copy_only=True) + # Regression test for ARROW-2101 def test_array_of_bytes_to_strings(self): converted = pa.array(np.array([b'x'], dtype=object), pa.string()) @@ -3299,6 +3346,10 @@ def _assert_nunique(obj, expected): def test_to_pandas_deduplicate_strings_array_types(): + if _pandas_api.uses_string_dtype(): + 
pytest.skip( + "pandas uses string dtype and not object dtype, keyword has no effect" + ) nunique = 100 repeats = 10 values = _generate_dedup_example(nunique, repeats) @@ -3311,6 +3362,10 @@ def test_to_pandas_deduplicate_strings_array_types(): def test_to_pandas_deduplicate_strings_table_types(): + if _pandas_api.uses_string_dtype(): + pytest.skip( + "pandas uses string dtype and not object dtype, keyword has no effect" + ) nunique = 100 repeats = 10 values = _generate_dedup_example(nunique, repeats) @@ -3774,20 +3829,26 @@ def _check_to_pandas_memory_unchanged(obj, **kwargs): x = obj.to_pandas(**kwargs) # noqa # Memory allocation unchanged -- either zero copy or self-destructing - assert pa.total_allocated_bytes() == prior_allocation + if _pandas_api.uses_string_dtype(): + # for the string array of the columns Index + # -> increase the size to account for overallocation for small arrays + max_index_allocation = max(192, x.columns.nbytes * 2) + assert pa.total_allocated_bytes() <= (prior_allocation + max_index_allocation) + else: + assert pa.total_allocated_bytes() == prior_allocation def test_to_pandas_split_blocks(): # ARROW-3789 t = pa.table([ - pa.array([1, 2, 3, 4, 5], type='i1'), - pa.array([1, 2, 3, 4, 5], type='i4'), - pa.array([1, 2, 3, 4, 5], type='i8'), - pa.array([1, 2, 3, 4, 5], type='f4'), - pa.array([1, 2, 3, 4, 5], type='f8'), - pa.array([1, 2, 3, 4, 5], type='f8'), - pa.array([1, 2, 3, 4, 5], type='f8'), - pa.array([1, 2, 3, 4, 5], type='f8'), + pa.array([1, 2, 3, 4, 5]*100, type='i1'), + pa.array([1, 2, 3, 4, 5]*100, type='i4'), + pa.array([1, 2, 3, 4, 5]*100, type='i8'), + pa.array([1, 2, 3, 4, 5]*100, type='f4'), + pa.array([1, 2, 3, 4, 5]*100, type='f8'), + pa.array([1, 2, 3, 4, 5]*100, type='f8'), + pa.array([1, 2, 3, 4, 5]*100, type='f8'), + pa.array([1, 2, 3, 4, 5]*100, type='f8'), ], ['f{}'.format(i) for i in range(8)]) _check_blocks_created(t, 8) @@ -3832,7 +3893,12 @@ def test_table_uses_memory_pool(): prior_allocation = pa.total_allocated_bytes() x = t.to_pandas() - assert pa.total_allocated_bytes() == (prior_allocation + 3 * N * 8) + new_allocation = 3 * N * 8 + if _pandas_api.uses_string_dtype(): + # for the small columns Index + new_allocation += 128 + + assert pa.total_allocated_bytes() == (prior_allocation + new_allocation) # Check successful garbage collection x = None # noqa @@ -4110,7 +4176,10 @@ def test_dictionary_encoded_nested_to_pandas(): def test_dictionary_from_pandas(): cat = pd.Categorical(['a', 'b', 'a']) - expected_type = pa.dictionary(pa.int8(), pa.string()) + expected_type = pa.dictionary( + pa.int8(), + pa.large_string() if _pandas_api.uses_string_dtype() else pa.string() + ) result = pa.array(cat) assert result.to_pylist() == ['a', 'b', 'a']