diff --git a/.gitattributes b/.gitattributes
index d94c19e7edb1f..bc7dec642df0f 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -84,5 +84,3 @@ pandas/tests/io/parser/data export-ignore
 # Include cibw script in sdist since it's needed for building wheels
 scripts/cibw_before_build.sh -export-ignore
-scripts/cibw_before_build_windows.sh -export-ignore
-scripts/cibw_before_test_windows.sh -export-ignore
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index e430681225cd9..3a7c71af02bf9 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -9,7 +9,6 @@ doc/cheatsheet @Dr-Irv
 doc/source/development @noatamir
 
 # pandas
-pandas/_libs/ @WillAyd
 pandas/_typing.py @Dr-Irv
 pandas/core/groupby/* @rhshadrach
 pandas/io/excel/* @rhshadrach
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yaml b/.github/ISSUE_TEMPLATE/feature_request.yaml
index 6e6cd78ace11d..9c15218794499 100644
--- a/.github/ISSUE_TEMPLATE/feature_request.yaml
+++ b/.github/ISSUE_TEMPLATE/feature_request.yaml
@@ -31,7 +31,7 @@ body:
     attributes:
       label: Feature Description
       description: >
-        Please describe how the new feature would be implemented, using psudocode if relevant.
+        Please describe how the new feature would be implemented, using pseudocode if relevant.
       placeholder: >
         Add a new parameter to DataFrame, to_series, to return a Series if possible.
diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml
index b92bacd1a537c..2d208cb38725a 100644
--- a/.github/actions/build_pandas/action.yml
+++ b/.github/actions/build_pandas/action.yml
@@ -4,6 +4,9 @@ inputs:
   editable:
     description: Whether to build pandas in editable mode (default true)
     default: true
+  werror:
+    description: Enable werror flag for build
+    default: true
 runs:
   using: composite
   steps:
@@ -26,9 +29,9 @@ runs:
       run: |
         if [[ ${{ inputs.editable }} == "true" ]]; then
           pip install -e . --no-build-isolation -v --no-deps \
-            -Csetup-args="--werror"
+            ${{ inputs.werror == 'true' && '-Csetup-args="--werror"' || '' }}
         else
          pip install . --no-build-isolation -v --no-deps \
-            -Csetup-args="--werror"
+            ${{ inputs.werror == 'true' && '-Csetup-args="--werror"' || '' }}
         fi
      shell: bash -el {0}
diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml
index e1d2d1ea846b8..728019b06e053 100644
--- a/.github/workflows/code-checks.yml
+++ b/.github/workflows/code-checks.yml
@@ -21,7 +21,7 @@ permissions:
 jobs:
   docstring_typing_manual_hooks:
     name: Docstring validation, typing, and other manual pre-commit hooks
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     defaults:
       run:
         shell: bash -el {0}
@@ -102,7 +102,7 @@ jobs:
   asv-benchmarks:
     name: ASV Benchmarks
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     defaults:
       run:
         shell: bash -el {0}
@@ -133,7 +133,7 @@ jobs:
   build_docker_dev_environment:
     name: Build Docker Dev Environment
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
    defaults:
       run:
         shell: bash -el {0}
@@ -160,7 +160,7 @@ jobs:
   requirements-dev-text-installable:
     name: Test install requirements-dev.txt
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     concurrency:
       # https://github.community/t/concurrecy-not-work-for-push/183068/7
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 4d0066bc0b48d..44a9b4bfa20b8 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -13,7 +13,7 @@ permissions:
 jobs:
   analyze:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     permissions:
       actions: read
       contents: read
diff --git a/.github/workflows/comment-commands.yml b/.github/workflows/comment-commands.yml
index 62956f5825782..b843363ae8c4d 100644
--- a/.github/workflows/comment-commands.yml
+++ b/.github/workflows/comment-commands.yml
@@ -10,7 +10,7 @@ permissions:
 jobs:
   issue_assign:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     if: (!github.event.issue.pull_request) && github.event.comment.body == 'take'
     concurrency:
       group: ${{ github.actor }}-issue-assign
@@ -19,7 +19,7 @@ jobs:
         echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}"
         curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees
   preview_docs:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     if: github.event.issue.pull_request && github.event.comment.body == '/preview'
     concurrency:
       group: ${{ github.actor }}-preview-docs
@@ -29,7 +29,7 @@ jobs:
       previewer-server: "https://pandas.pydata.org/preview"
       artifact-job: "Doc Build and Upload"
   asv_run:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     # TODO: Support more benchmarking options later, against different branches, against self, etc
     if: github.event.issue.pull_request && startsWith(github.event.comment.body, '@github-actions benchmark')
     defaults:
diff --git a/.github/workflows/deprecation-tracking-bot.yml b/.github/workflows/deprecation-tracking-bot.yml
index 3d4cab7be09c5..334a5d77b407b 100644
--- a/.github/workflows/deprecation-tracking-bot.yml
+++ b/.github/workflows/deprecation-tracking-bot.yml
@@ -17,7 +17,7 @@ jobs:
   deprecation_update:
     permissions:
       issues: write
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     env:
       DEPRECATION_TRACKER_ISSUE: 56596
     steps:
diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml
index 294334ca1d54b..ba9e30e088c66 100644
--- a/.github/workflows/docbuild-and-upload.yml
+++ b/.github/workflows/docbuild-and-upload.yml
@@ -23,7 +23,7 @@ permissions:
 jobs:
   web_and_docs:
     name: Doc Build and Upload
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     concurrency:
       # https://github.community/t/concurrecy-not-work-for-push/183068/7
diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml
index 331af6e05b650..9800cc1694313 100644
--- a/.github/workflows/package-checks.yml
+++ b/.github/workflows/package-checks.yml
@@ -21,7 +21,7 @@ defaults:
 jobs:
   pip:
     if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}}
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     strategy:
       matrix:
         extra: ["test", "pyarrow", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "all"]
@@ -50,7 +50,7 @@ jobs:
         shell: bash -el {0}
   conda_forge_recipe:
     if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}}
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     strategy:
       matrix:
         python-version: ['3.10', '3.11']
diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml
index 792afe8f4faf5..3a51dbefc6bb0 100644
--- a/.github/workflows/stale-pr.yml
+++ b/.github/workflows/stale-pr.yml
@@ -12,7 +12,7 @@ jobs:
     permissions:
       pull-requests: write
     if: github.repository_owner == 'pandas-dev'
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     steps:
       - uses: actions/stale@v9
         with:
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 08c41a1eeb21f..59512ddc91a8a 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -26,8 +26,8 @@ jobs:
     timeout-minutes: 90
     strategy:
       matrix:
-        platform: [ubuntu-22.04, ubuntu-24.04-arm]
-        env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml]
+        platform: [ubuntu-24.04, ubuntu-24.04-arm]
+        env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml, actions-313.yaml]
         # Prevent the include jobs from overriding other jobs
         pattern: [""]
         pandas_future_infer_string: ["0"]
@@ -36,11 +36,15 @@
           env_file: actions-311-downstream_compat.yaml
           pattern: "not slow and not network and not single_cpu"
           pytest_target: "pandas/tests/test_downstream.py"
-          platform: ubuntu-22.04
+          platform: ubuntu-24.04
         - name: "Minimum Versions"
           env_file: actions-310-minimum_versions.yaml
           pattern: "not slow and not network and not single_cpu"
-          platform: ubuntu-22.04
+          platform: ubuntu-24.04
+        - name: "Freethreading"
+          env_file: actions-313-freethreading.yaml
+          pattern: "not slow and not network and not single_cpu"
+          platform: ubuntu-24.04
         - name: "Locale: it_IT"
           env_file: actions-311.yaml
           pattern: "not slow and not network and not single_cpu"
@@ -51,7 +55,7 @@
           # Also install it_IT (its encoding is ISO8859-1) but do not activate it.
           # It will be temporarily activated during tests with locale.setlocale
           extra_loc: "it_IT"
-          platform: ubuntu-22.04
+          platform: ubuntu-24.04
         - name: "Locale: zh_CN"
           env_file: actions-311.yaml
           pattern: "not slow and not network and not single_cpu"
@@ -62,30 +66,30 @@
          # Also install zh_CN (its encoding is gb2312) but do not activate it.
          # It will be temporarily activated during tests with locale.setlocale
           extra_loc: "zh_CN"
-          platform: ubuntu-22.04
+          platform: ubuntu-24.04
         - name: "Future infer strings"
           env_file: actions-312.yaml
           pandas_future_infer_string: "1"
-          platform: ubuntu-22.04
+          platform: ubuntu-24.04
         - name: "Future infer strings (without pyarrow)"
           env_file: actions-311.yaml
           pandas_future_infer_string: "1"
-          platform: ubuntu-22.04
+          platform: ubuntu-24.04
         - name: "Pypy"
           env_file: actions-pypy-39.yaml
           pattern: "not slow and not network and not single_cpu"
           test_args: "--max-worker-restart 0"
-          platform: ubuntu-22.04
+          platform: ubuntu-24.04
         - name: "Numpy Dev"
           env_file: actions-311-numpydev.yaml
           pattern: "not slow and not network and not single_cpu"
           test_args: "-W error::DeprecationWarning -W error::FutureWarning"
-          platform: ubuntu-22.04
+          platform: ubuntu-24.04
         - name: "Pyarrow Nightly"
           env_file: actions-311-pyarrownightly.yaml
           pattern: "not slow and not network and not single_cpu"
           pandas_future_infer_string: "1"
-          platform: ubuntu-22.04
+          platform: ubuntu-24.04
       fail-fast: false
     name: ${{ matrix.name || format('{0} {1}', matrix.platform, matrix.env_file) }}
     env:
@@ -165,6 +169,9 @@
       - name: Build Pandas
        id: build
        uses: ./.github/actions/build_pandas
+        with:
+          # xref https://github.com/cython/cython/issues/6870
+          werror: ${{ matrix.name != 'Freethreading' }}
        # TODO: Re-enable once Pypy has Pypy 3.10 on conda-forge
        if: ${{ matrix.name != 'Pypy' }}
@@ -188,7 +195,7 @@
      matrix:
        # Note: Don't use macOS latest since macos 14 appears to be arm64 only
        os: [macos-13, macos-14, windows-latest]
-        env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml]
+        env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml, actions-313.yaml]
      fail-fast: false
    runs-on: ${{ matrix.os }}
    name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }}
@@ -220,7 +227,7 @@
        uses: ./.github/actions/run-tests
 
   Linux-32-bit:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     container:
       image: quay.io/pypa/manylinux2014_i686
       options: --platform linux/386
@@ -241,12 +248,14 @@
          fi
      - name: Build environment and Run Tests
        # https://github.com/numpy/numpy/issues/24703#issuecomment-1722379388
+        # Note: Pinned to Cython 3.0.10 to avoid numerical instability in 32-bit environments
+        # https://github.com/pandas-dev/pandas/pull/61423
        run: |
          /opt/python/cp313-cp313/bin/python -m venv ~/virtualenvs/pandas-dev
          . ~/virtualenvs/pandas-dev/bin/activate
          python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1
          python -m pip install numpy -Csetup-args="-Dallow-noblas=true"
-          python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
+          python -m pip install --no-cache-dir versioneer[toml] cython==3.0.10 python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
          python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror"
          python -m pip list --no-cache-dir
          PANDAS_CI=1 python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml
@@ -256,7 +265,7 @@
      cancel-in-progress: true
 
   Linux-Musl:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     container:
       image: quay.io/pypa/musllinux_1_2_x86_64
     steps:
@@ -316,7 +325,7 @@
   # To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs
   # to the corresponding posix/windows-macos/sdist etc. workflows.
   # Feel free to modify this comment as necessary.
-  # if: false  # Uncomment this to freeze the workflow, comment it to unfreeze
+  if: false
   defaults:
     run:
       shell: bash -eou pipefail {0}
@@ -325,7 +334,7 @@
      fail-fast: false
      matrix:
        # Separate out macOS 13 and 14, since macOS 14 is arm64 only
-        os: [ubuntu-22.04, macOS-13, macOS-14, windows-latest]
+        os: [ubuntu-24.04, macOS-13, macOS-14, windows-latest]
 
    timeout-minutes: 90
 
@@ -362,48 +371,6 @@
      - name: Run Tests
        uses: ./.github/actions/run-tests
 
-  python-freethreading:
-    defaults:
-      run:
-        shell: bash -eou pipefail {0}
-    runs-on: ubuntu-22.04
-
-    timeout-minutes: 90
-
-    concurrency:
-      # https://github.community/t/concurrecy-not-work-for-push/183068/7
-      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-python-freethreading-dev
-      cancel-in-progress: true
-
-    env:
-      PYTEST_WORKERS: "auto"
-      PANDAS_CI: 1
-      PATTERN: "not slow and not network and not clipboard and not single_cpu"
-      PYTEST_TARGET: pandas
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Set up Python Free-threading Version
-        uses: deadsnakes/action@v3.2.0
-        with:
-          python-version: 3.13-dev
-          nogil: true
-
-      - name: Build Environment
-        run: |
-          python --version
-          python -m pip install --upgrade pip setuptools wheel numpy meson[ninja]==1.2.1 meson-python==0.13.1
-          python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython
-          python -m pip install versioneer[toml] python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov
-          python -m pip install -ve . --no-build-isolation --no-index --no-deps -Csetup-args="--werror"
-          python -m pip list
-
-      - name: Run Tests
-        uses: ./.github/actions/run-tests
-
   # NOTE: this job must be kept in sync with the Pyodide build job in wheels.yml
   emscripten:
     # Note: the Python version, Emscripten toolchain version are determined
@@ -413,7 +380,7 @@
    # The Node.js version can be determined via Pyodide:
    # https://pyodide.org/en/stable/usage/index.html#node-js
    name: Pyodide build
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
    concurrency:
      # https://github.community/t/concurrecy-not-work-for-push/183068/7
      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-wasm
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 2dcc79085734b..4de7aec4f551a 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -40,7 +40,7 @@ jobs:
      (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'Build')) ||
      (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0')))
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     env:
       IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
       IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
@@ -92,26 +92,30 @@ jobs:
        # GitHub Actions doesn't support pairing matrix values together, let's improvise
        # https://github.com/github/feedback/discussions/7835#discussioncomment-1769026
        buildplat:
-        - [ubuntu-22.04, manylinux_x86_64]
-        - [ubuntu-22.04, musllinux_x86_64]
+        - [ubuntu-24.04, manylinux_x86_64]
+        - [ubuntu-24.04, musllinux_x86_64]
         - [ubuntu-24.04-arm, manylinux_aarch64]
+        - [ubuntu-24.04-arm, musllinux_aarch64]
         - [macos-13, macosx_x86_64]
         # Note: M1 images on Github Actions start from macOS 14
         - [macos-14, macosx_arm64]
         - [windows-2022, win_amd64]
+        - [windows-11-arm, win_arm64]
        # TODO: support PyPy?
        python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"]]
        include:
-        # TODO: Remove this plus installing build deps in cibw_before_build.sh
-        # after pandas can be built with a released NumPy/Cython
-        - python: ["cp313t", "3.13"]
-          cibw_build_frontend: 'pip; args: --no-build-isolation'
        # Build Pyodide wheels and upload them to Anaconda.org
        # NOTE: this job is similar to the one in unit-tests.yml except for the fact
        # that it uses cibuildwheel instead of a standard Pyodide xbuildenv setup.
-        - buildplat: [ubuntu-22.04, pyodide_wasm32]
+        - buildplat: [ubuntu-24.04, pyodide_wasm32]
          python: ["cp312", "3.12"]
          cibw_build_frontend: 'build'
+        exclude:
+        - buildplat: [windows-11-arm, win_arm64]
+          python: ["cp310", "3.10"]
+        # BackendUnavailable: Cannot import 'mesonpy'
+        - buildplat: [windows-11-arm, win_arm64]
+          python: ["cp313t", "3.13"]
 
    env:
      IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
      IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
@@ -122,6 +126,12 @@
        with:
          fetch-depth: 0
 
+      - name: Set up MSVC environment for ARM64
+        if: matrix.buildplat[1] == 'win_arm64'
+        uses: ilammy/msvc-dev-cmd@v1
+        with:
+          arch: arm64
+
      # TODO: Build wheels from sdist again
      # There's some sort of weird race condition?
      # within Github that makes the sdist be missing files
@@ -153,15 +163,19 @@
        run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"
 
      - name: Build wheels
-        uses: pypa/cibuildwheel@v2.23.1
+        uses: pypa/cibuildwheel@v2.23.3
        with:
          package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
        env:
          CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }}
          CIBW_BUILD_FRONTEND: ${{ matrix.cibw_build_frontend || 'pip' }}
-          CIBW_PLATFORM: ${{ matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide' || 'auto' }}
+          CIBW_PLATFORM: ${{ (matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide') || (matrix.buildplat[1] == 'win_arm64' && 'windows') || 'auto' }}
+          CIBW_ARCHS: ${{ matrix.buildplat[1] == 'win_arm64' && 'ARM64' || 'auto' }}
+          CIBW_BEFORE_BUILD_WINDOWS: 'python -m pip install delvewheel'
 
-      - name: Set up Python
+      - name: Set up Python for validation/upload (non-ARM64 Windows & other OS)
+        # micromamba is not available for ARM64 Windows
+        if: matrix.buildplat[1] != 'win_arm64'
        uses: mamba-org/setup-micromamba@v2
        with:
          environment-name: wheel-env
@@ -174,6 +188,12 @@
          cache-downloads: true
          cache-environment: true
 
+      - name: Install wheel for win_arm64
+        # installing wheel here because micromamba step was skipped
+        if: matrix.buildplat[1] == 'win_arm64'
+        shell: bash -el {0}
+        run: python -m pip install wheel
+
      - name: Validate wheel RECORD
        shell: bash -el {0}
        run: for whl in $(ls wheelhouse); do wheel unpack wheelhouse/$whl -d /tmp; done
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 09bfda1755e03..b5856810b749e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,4 @@
-minimum_pre_commit_version: 2.15.0
+minimum_pre_commit_version: 4.0.0
 exclude: ^LICENSES/|\.(html|csv|svg)$
 # reserve "manual" for relatively slow hooks which we still want to run in CI
 default_stages: [
@@ -19,13 +19,13 @@ ci:
     skip: [pyright, mypy]
 repos:
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.9.9
+    rev: v0.11.12
     hooks:
     -   id: ruff
         args: [--exit-non-zero-on-fix]
         exclude: ^pandas/tests/frame/test_query_eval.py
     -   id: ruff
-        # TODO: remove autofixe-only rules when they are checked by ruff
+        # TODO: remove autofix only rules when they are checked by ruff
         name: ruff-selected-autofixes
         alias: ruff-selected-autofixes
         files: ^pandas
@@ -34,7 +34,7 @@ repos:
     -   id: ruff-format
         exclude: ^scripts|^pandas/tests/frame/test_query_eval.py
 -   repo: https://github.com/jendrikseipp/vulture
-    rev: 'v2.14'
+    rev: v2.14
     hooks:
     -   id: vulture
         entry: python scripts/run_vulture.py
@@ -74,7 +74,7 @@ repos:
     hooks:
     -   id: isort
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v3.19.1
+    rev: v3.20.0
     hooks:
     -   id: pyupgrade
         args: [--py310-plus]
@@ -95,14 +95,14 @@ repos:
     -   id: sphinx-lint
         args: ["--enable", "all", "--disable", "line-too-long"]
 -   repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v19.1.7
+    rev: v20.1.5
     hooks:
     -   id: clang-format
         files: ^pandas/_libs/src|^pandas/_libs/include
         args: [-i]
         types_or: [c, c++]
 -   repo: https://github.com/trim21/pre-commit-mirror-meson
-    rev: v1.7.0
+    rev: v1.8.1
     hooks:
     -   id: meson-fmt
         args: ['--inplace']
@@ -140,7 +140,7 @@ repos:
         pass_filenames: false
         types: [python]
         stages: [manual]
-    -   id: mypy
+    -   id: stubtest
         # note: assumes python env is setup and activated
         # note: requires pandas dev to be installed
         name: mypy (stubtest)
diff --git a/Dockerfile b/Dockerfile
index 4090a4adb1af8..e778312fd3aa2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,16 +1,20 @@
 FROM python:3.10.8
 WORKDIR /home/pandas
 
-RUN apt-get update && apt-get -y upgrade
-RUN apt-get install -y build-essential bash-completion
+RUN apt-get update && \
+    apt-get --no-install-recommends -y upgrade && \
+    apt-get --no-install-recommends -y install \
+    build-essential \
+    bash-completion \
+    # hdf5 needed for pytables installation
+    libhdf5-dev \
+    # libgles2-mesa needed for pytest-qt
+    libgles2-mesa-dev && \
+    rm -rf /var/lib/apt/lists/*
 
-# hdf5 needed for pytables installation
-# libgles2-mesa needed for pytest-qt
-RUN apt-get install -y libhdf5-dev libgles2-mesa-dev
-
-RUN python -m pip install --upgrade pip
 COPY requirements-dev.txt /tmp
-RUN python -m pip install -r /tmp/requirements-dev.txt
+RUN python -m pip install --no-cache-dir --upgrade pip && \
+    python -m pip install --no-cache-dir -r /tmp/requirements-dev.txt
 RUN git config --global --add safe.directory /home/pandas
 
 ENV SHELL="/bin/bash"
diff --git a/MANIFEST.in b/MANIFEST.in
index c59151f340545..a7d7d7eb4e062 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -65,5 +65,3 @@ graft pandas/_libs/include
 # Include cibw script in sdist since it's needed for building wheels
 include scripts/cibw_before_build.sh
-include scripts/cibw_before_build_windows.sh
-include scripts/cibw_before_test_windows.sh
diff --git a/README.md b/README.md
index 1a273fdb896c5..ebab2e6016850 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 
 -----------------
 
-# pandas: powerful Python data analysis toolkit
+# pandas: A Powerful Python Data Analysis Toolkit
 
 | | |
 | --- | --- |
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
index 30c692115eab1..d286e57ce6b51 100644
--- a/asv_bench/asv.conf.json
+++ b/asv_bench/asv.conf.json
@@ -42,7 +42,7 @@
     // followed by the pip installed packages).
     "matrix": {
         "pip+build": [],
-        "Cython": ["3.0"],
+        "Cython": [],
         "matplotlib": [],
         "sqlalchemy": [],
         "scipy": [],
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index 6a2ab24df26fe..cd7851acae3f2 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -517,7 +517,7 @@ def setup(self):
         self.df = DataFrame(np.random.randn(1000, 100))
         self.s = Series(np.arange(1028.0))
-        self.df2 = DataFrame({i: self.s for i in range(1028)})
+        self.df2 = DataFrame(dict.fromkeys(range(1028), self.s))
         self.df3 = DataFrame(np.random.randn(1000, 3), columns=list("ABC"))
 
     def time_apply_user_func(self):
diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py
index 5e3c593e269cb..da0e7de585391 100644
--- a/asv_bench/benchmarks/indexing_engines.py
+++ b/asv_bench/benchmarks/indexing_engines.py
@@ -67,6 +67,14 @@ class NumericEngineIndexing:
     def setup(self, engine_and_dtype, index_type, unique, N):
         engine, dtype = engine_and_dtype
 
+        if (
+            index_type == "non_monotonic"
+            and dtype in [np.int16, np.int8, np.uint8]
+            and unique
+        ):
+            # Values overflow
+            raise NotImplementedError
+
         if index_type == "monotonic_incr":
             if unique:
                 arr = np.arange(N * 3, dtype=dtype)
@@ -115,6 +123,14 @@ def setup(self, engine_and_dtype, index_type, unique, N):
         engine, dtype = engine_and_dtype
         dtype = dtype.lower()
 
+        if (
+            index_type == "non_monotonic"
+            and dtype in ["int16", "int8", "uint8"]
+            and unique
+        ):
+            # Values overflow
+            raise NotImplementedError
+
         if index_type == "monotonic_incr":
             if unique:
                 arr = np.arange(N * 3, dtype=dtype)
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 2c32eb4f0c584..a0d23aa0478d2 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -72,9 +72,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
        -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
        -i "pandas.Period.freq GL08" \
        -i "pandas.Period.ordinal GL08" \
-        -i "pandas.Timestamp.max PR02" \
-        -i "pandas.Timestamp.min PR02" \
-        -i "pandas.Timestamp.resolution PR02" \
        -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
        -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
        -i "pandas.core.resample.Resampler.quantile PR01,PR07" \
diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml
index c7c72828db481..9f12fe941d488 100644
--- a/ci/deps/actions-310-minimum_versions.yaml
+++ b/ci/deps/actions-310-minimum_versions.yaml
@@ -8,7 +8,7 @@ dependencies:
   # build dependencies
   - versioneer
-  - cython>=0.29.33
+  - cython<4.0.0a0
   - meson=1.2.1
   - meson-python=0.13.1
@@ -18,46 +18,46 @@ dependencies:
   - pytest-xdist>=3.4.0
   - pytest-localserver>=0.8.1
   - pytest-qt>=4.4.0
-  - boto3
+  - boto3=1.37.3
 
   # required dependencies
   - python-dateutil=2.8.2
   - numpy=1.23.5
 
   # optional dependencies
-  - beautifulsoup4=4.11.2
-  - blosc=1.21.3
+  - beautifulsoup4=4.12.3
   - bottleneck=1.3.6
-  - fastparquet=2023.10.0
-  - fsspec=2022.11.0
+  - fastparquet=2024.2.0
+  - fsspec=2023.12.2
   - html5lib=1.1
   - hypothesis=6.84.0
-  - gcsfs=2022.11.0
-  - jinja2=3.1.2
+  - gcsfs=2023.12.2
+  - jinja2=3.1.3
   - lxml=4.9.2
-  - matplotlib=3.6.3
-  - numba=0.56.4
-  - numexpr=2.8.4
+  - matplotlib=3.8.3
+  - numba=0.59.0
+  - numexpr=2.9.0
   - odfpy=1.4.1
   - qtpy=2.3.0
-  - openpyxl=3.1.0
+  - openpyxl=3.1.2
   - psycopg2=2.9.6
   - pyarrow=10.0.1
-  - pymysql=1.0.2
+  - pyiceberg=0.7.1
+  - pymysql=1.1.0
   - pyqt=5.15.9
-  - pyreadstat=1.2.0
+  - pyreadstat=1.2.6
   - pytables=3.8.0
   - python-calamine=0.1.7
   - pytz=2023.4
   - pyxlsb=1.0.10
-  - s3fs=2022.11.0
-  - scipy=1.10.0
+  - s3fs=2023.12.2
+  - scipy=1.12.0
   - sqlalchemy=2.0.0
   - tabulate=0.9.0
-  - xarray=2022.12.0
+  - xarray=2024.1.1
   - xlrd=2.0.1
-  - xlsxwriter=3.0.5
-  - zstandard=0.19.0
+  - xlsxwriter=3.2.0
+  - zstandard=0.22.0
 
   - pip:
     - adbc-driver-postgresql==0.10.0
diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
index 74cab4e0970dc..66d49475bf34b 100644
--- a/ci/deps/actions-310.yaml
+++ b/ci/deps/actions-310.yaml
@@ -6,7 +6,7 @@ dependencies:
   # build dependencies
   - versioneer
-  - cython>=0.29.33
+  - cython<4.0.0a0
   - meson=1.2.1
   - meson-python=0.13.1
@@ -16,46 +16,46 @@ dependencies:
   - pytest-xdist>=3.4.0
   - pytest-localserver>=0.8.1
   - pytest-qt>=4.4.0
-  - boto3
+  - boto3=1.37.3
 
   # required dependencies
   - python-dateutil
   - numpy
 
   # optional dependencies
-  - beautifulsoup4>=4.11.2
-  - blosc>=1.21.3
+  - beautifulsoup4>=4.12.3
   - bottleneck>=1.3.6
-  - fastparquet>=2023.10.0
-  - fsspec>=2022.11.0
+  - fastparquet>=2024.2.0
+  - fsspec>=2023.12.2
   - html5lib>=1.1
   - hypothesis>=6.84.0
-  - gcsfs>=2022.11.0
-  - jinja2>=3.1.2
+  - gcsfs>=2023.12.2
+  - jinja2>=3.1.3
   - lxml>=4.9.2
-  - matplotlib>=3.6.3
-  - numba>=0.56.4
-  - numexpr>=2.8.4
+  - matplotlib>=3.8.3
+  - numba>=0.59.0
+  - numexpr>=2.9.0
   - odfpy>=1.4.1
   - qtpy>=2.3.0
-  - openpyxl>=3.1.0
+  - openpyxl>=3.1.2
   - psycopg2>=2.9.6
   - pyarrow>=10.0.1
-  - pymysql>=1.0.2
+  - pyiceberg>=0.7.1
+  - pymysql>=1.1.0
   - pyqt>=5.15.9
-  - pyreadstat>=1.2.0
+  - pyreadstat>=1.2.6
   - pytables>=3.8.0
   - python-calamine>=0.1.7
   - pytz>=2023.4
   - pyxlsb>=1.0.10
-  - s3fs>=2022.11.0
-  - scipy>=1.10.0
+  - s3fs>=2023.12.2
+  - scipy>=1.12.0
   - sqlalchemy>=2.0.0
   - tabulate>=0.9.0
-  - xarray>=2022.12.0, <=2024.9.0
+  - xarray>=2024.1.1
   - xlrd>=2.0.1
-  - xlsxwriter>=3.0.5
-  - zstandard>=0.19.0
+  - xlsxwriter>=3.2.0
+  - zstandard>=0.22.0
 
   - pip:
     - adbc-driver-postgresql>=0.10.0
diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml
index 092ca18d61259..100a250f0bf01 100644
--- a/ci/deps/actions-311-downstream_compat.yaml
+++ b/ci/deps/actions-311-downstream_compat.yaml
@@ -7,7 +7,7 @@ dependencies:
   # build dependencies
   - versioneer
-  - cython>=0.29.33
+  - cython<4.0.0a0
   - meson=1.2.1
   - meson-python=0.13.1
@@ -17,60 +17,58 @@ dependencies:
   - pytest-xdist>=3.4.0
   - pytest-localserver>=0.8.1
   - pytest-qt>=4.4.0
-  - boto3
+  - boto3=1.37.3
 
   # required dependencies
   - python-dateutil
   - numpy
 
   # optional dependencies
-  - beautifulsoup4>=4.11.2
-  - blosc>=1.21.3
+  - beautifulsoup4>=4.12.3
   - bottleneck>=1.3.6
-  - fastparquet>=2023.10.0
-  - fsspec>=2022.11.0
+  - fastparquet>=2024.2.0
+  - fsspec>=2023.12.2
   - html5lib>=1.1
   - hypothesis>=6.84.0
-  - gcsfs>=2022.11.0
-  - jinja2>=3.1.2
+  - gcsfs>=2023.12.2
+  - jinja2>=3.1.3
   - lxml>=4.9.2
-  - matplotlib>=3.6.3
-  - numba>=0.56.4
-  - numexpr>=2.8.4
+  - matplotlib>=3.8.3
+  - numba>=0.59.0
+  - numexpr>=2.9.0
   - odfpy>=1.4.1
   - qtpy>=2.3.0
-  - openpyxl>=3.1.0
+  - openpyxl>=3.1.2
   - psycopg2>=2.9.6
   - pyarrow>=10.0.1
-  - pymysql>=1.0.2
+  - pyiceberg>=0.7.1
+  - pymysql>=1.1.0
   - pyqt>=5.15.9
-  - pyreadstat>=1.2.0
+  - pyreadstat>=1.2.6
   - pytables>=3.8.0
   - python-calamine>=0.1.7
   - pytz>=2023.4
   - pyxlsb>=1.0.10
-  - s3fs>=2022.11.0
-  - scipy>=1.10.0
+  - s3fs>=2023.12.2
+  - scipy>=1.12.0
   - sqlalchemy>=2.0.0
   - tabulate>=0.9.0
-  - xarray>=2022.12.0, <=2024.9.0
+  - xarray>=2024.1.1
   - xlrd>=2.0.1
-  - xlsxwriter>=3.0.5
-  - zstandard>=0.19.0
+  - xlsxwriter>=3.2.0
+  - zstandard>=0.22.0
 
   # downstream packages
   - botocore
   - cftime
   - dask
   - ipython
-  - geopandas-base
   - seaborn
   - scikit-learn
   - statsmodels
   - coverage
   - pandas-datareader
   - pyyaml
-  - py
   - pip:
     - adbc-driver-postgresql>=0.10.0
     - adbc-driver-sqlite>=0.8.0
diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml
index 325a6d45d74fd..99cbe0415b4f9 100644
--- a/ci/deps/actions-311-numpydev.yaml
+++ b/ci/deps/actions-311-numpydev.yaml
@@ -8,7 +8,7 @@ dependencies:
   - versioneer
   - meson=1.2.1
   - meson-python=0.13.1
-  - cython>=0.29.33
+  - cython<4.0.0a0
 
   # test dependencies
   - pytest>=7.3.2
diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml
index 2d3d11c294e12..da0cecda0fb46 100644
--- a/ci/deps/actions-311-pyarrownightly.yaml
+++ b/ci/deps/actions-311-pyarrownightly.yaml
@@ -7,7 +7,7 @@ dependencies:
   # build dependencies
   - versioneer
   - meson=1.2.1
-  - cython>=0.29.33
+  - cython<4.0.0a0
   - meson-python=0.13.1
 
   # test dependencies
diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
index b6f515dceaea9..9669c1e29a435 100644
--- a/ci/deps/actions-311.yaml
+++ b/ci/deps/actions-311.yaml
@@ -6,7 +6,7 @@ dependencies:
   # build dependencies
   - versioneer
-  - cython>=0.29.33
+  - cython<4.0.0a0
   - meson=1.2.1
   - meson-python=0.13.1
@@ -16,46 +16,46 @@ dependencies:
   - pytest-xdist>=3.4.0
   - pytest-localserver>=0.8.1
   - pytest-qt>=4.4.0
-  - boto3
+  - boto3=1.37.3
 
   # required dependencies
   - python-dateutil
   - numpy
 
   # optional dependencies
-  - beautifulsoup4>=4.11.2
-  - blosc>=1.21.3
+  - beautifulsoup4>=4.12.3
   - bottleneck>=1.3.6
-  - fastparquet>=2023.10.0
-  - fsspec>=2022.11.0
+  - fastparquet>=2024.2.0
+  - fsspec>=2023.12.2
   - html5lib>=1.1
   - hypothesis>=6.84.0
-  - gcsfs>=2022.11.0
-  - jinja2>=3.1.2
+  - gcsfs>=2023.12.2
+  - jinja2>=3.1.3
   - lxml>=4.9.2
-  - matplotlib>=3.6.3
-  - numba>=0.56.4
-  - numexpr>=2.8.4
+  - matplotlib>=3.8.3
+  - numba>=0.59.0
+  - numexpr>=2.9.0
   - odfpy>=1.4.1
   - qtpy>=2.3.0
   - pyqt>=5.15.9
-  - openpyxl>=3.1.0
+  - openpyxl>=3.1.2
   - psycopg2>=2.9.6
   - pyarrow>=10.0.1
-  - pymysql>=1.0.2
-  - pyreadstat>=1.2.0
+  - pyiceberg>=0.7.1
+  - pymysql>=1.1.0
+  - pyreadstat>=1.2.6
   - pytables>=3.8.0
   - python-calamine>=0.1.7
   - pytz>=2023.4
   - pyxlsb>=1.0.10
-  - s3fs>=2022.11.0
-  - scipy>=1.10.0
+  - s3fs>=2023.12.2
+  - scipy>=1.12.0
   - sqlalchemy>=2.0.0
   - tabulate>=0.9.0
-  - xarray>=2022.12.0, <=2024.9.0
+  - xarray>=2024.1.1
   - xlrd>=2.0.1
-  - xlsxwriter>=3.0.5
-  - zstandard>=0.19.0
+  - xlsxwriter>=3.2.0
+  - zstandard>=0.22.0
 
   - pip:
     - adbc-driver-postgresql>=0.10.0
diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml
index bc66f8a5382c9..61f1d602bb241 100644
--- a/ci/deps/actions-312.yaml
+++ b/ci/deps/actions-312.yaml
@@ -6,7 +6,7 @@ dependencies:
   # build dependencies
   - versioneer
-  - cython>=0.29.33
+  - cython<4.0.0a0
   - meson=1.2.1
   - meson-python=0.13.1
@@ -16,46 +16,46 @@ dependencies:
   - pytest-xdist>=3.4.0
   - pytest-localserver>=0.8.1
   - pytest-qt>=4.4.0
-  - boto3
+  - boto3=1.37.3
 
   # required dependencies
   - python-dateutil
   - numpy
 
   # optional dependencies
-  - beautifulsoup4>=4.11.2
-  - blosc>=1.21.3
+  - beautifulsoup4>=4.12.3
   - bottleneck>=1.3.6
-  - fastparquet>=2023.10.0
-  - fsspec>=2022.11.0
+  - fastparquet>=2024.2.0
+  - fsspec>=2023.12.2
   - html5lib>=1.1
   - hypothesis>=6.84.0
-  - gcsfs>=2022.11.0
-  - jinja2>=3.1.2
+  - gcsfs>=2023.12.2
+  - jinja2>=3.1.3
   - lxml>=4.9.2
-  - matplotlib>=3.6.3
-  - numba>=0.56.4
-  - numexpr>=2.8.4
+  - matplotlib>=3.8.3
+  - numba>=0.59.0
+  - numexpr>=2.9.0
   - odfpy>=1.4.1
   - qtpy>=2.3.0
   - pyqt>=5.15.9
-  - openpyxl>=3.1.0
+  - openpyxl>=3.1.2
   - psycopg2>=2.9.6
   - pyarrow>=10.0.1
-  - pymysql>=1.0.2
-  - pyreadstat>=1.2.0
+  - pyiceberg>=0.7.1
+  - pymysql>=1.1.0
+  - pyreadstat>=1.2.6
   - pytables>=3.8.0
   - python-calamine>=0.1.7
   - pytz>=2023.4
   - pyxlsb>=1.0.10
-  - s3fs>=2022.11.0
-  - scipy>=1.10.0
+  - s3fs>=2023.12.2
+  - scipy>=1.12.0
   - sqlalchemy>=2.0.0
   - tabulate>=0.9.0
-  - xarray>=2022.12.0, <=2024.9.0
+  - xarray>=2024.1.1
   - xlrd>=2.0.1
-  - xlsxwriter>=3.0.5
-  - zstandard>=0.19.0
+  - xlsxwriter>=3.2.0
+  - zstandard>=0.22.0
 
   - pip:
     - adbc-driver-postgresql>=0.10.0
diff --git a/ci/deps/actions-313-freethreading.yaml b/ci/deps/actions-313-freethreading.yaml
new file mode 100644
index 0000000000000..14e3ade976b01
--- /dev/null
+++ b/ci/deps/actions-313-freethreading.yaml
@@ -0,0 +1,29 @@
+name: pandas-dev-313-freethreading
+channels:
+  - conda-forge
+dependencies:
+  - python-freethreading
+
+  # build dependencies
+  - setuptools
+  - versioneer
+  - cython<4.0.0a0
+  - meson=1.8.0
+  - meson-python=0.18.0
+
+  # test dependencies
+  - pytest>=7.3.2
+  - pytest-xdist>=3.4.0
+
+  # required dependencies
+  - python-dateutil
+  - numpy
+
+  # optional dependencies
+  - hypothesis>=6.84.0
+
+  - pip:
+    # No free-threaded coveragepy (with the C-extension) on conda-forge yet
+    - pytest-cov
+    - "tzdata>=2022.7"
+    - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
diff --git a/ci/deps/actions-313.yaml b/ci/deps/actions-313.yaml
new file mode 100644
index 0000000000000..11f4428be27e5
--- /dev/null
+++ b/ci/deps/actions-313.yaml
@@ -0,0 +1,63 @@
+name: pandas-dev-313
+channels:
+  - conda-forge
+dependencies:
+  - python=3.13
+
+  # build dependencies
+  - versioneer
+  - cython<4.0.0a0
+  - meson=1.2.1
+  - meson-python=0.13.1
+
+  # test dependencies
+  - pytest>=7.3.2
+  - pytest-cov
+  - pytest-xdist>=3.4.0
+  - pytest-localserver>=0.8.1
+  - pytest-qt>=4.4.0
+  - boto3=1.37.3
+
+  # required dependencies
+  - python-dateutil
+  - numpy
+
+  # optional dependencies
+  - beautifulsoup4>=4.12.3
+  - blosc>=1.21.3
+  - bottleneck>=1.3.6
+  - fastparquet>=2024.2.0
+  - fsspec>=2023.12.2
+  - html5lib>=1.1
+  - hypothesis>=6.84.0
+  - gcsfs>=2023.12.2
+  - jinja2>=3.1.3
+  - lxml>=4.9.2
+  - matplotlib>=3.8.3
+  - numba>=0.59.0
+  - numexpr>=2.9.0
+  - odfpy>=1.4.1
+  - qtpy>=2.3.0
+  - pyqt>=5.15.9
+  - openpyxl>=3.1.2
+  - psycopg2>=2.9.6
+  - pyarrow>=10.0.1
+  - pymysql>=1.1.0
+  - pyreadstat>=1.2.6
+  - pytables>=3.8.0
+  - python-calamine>=0.1.7
+  - pytz>=2023.4
+  - pyxlsb>=1.0.10
+  - s3fs>=2023.12.2
+  - scipy>=1.12.0
+  - sqlalchemy>=2.0.0
+  - tabulate>=0.9.0
+  - xarray>=2024.1.1
+  - xlrd>=2.0.1
+  - xlsxwriter>=3.2.0
+  - zstandard>=0.22.0
+
+  - pip:
+    - adbc-driver-postgresql>=0.10.0
+    - adbc-driver-sqlite>=0.8.0
+    - tzdata>=2022.7
diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml
index 90933b24b88db..e0ddc6954e4a4 100644
--- a/ci/deps/actions-pypy-39.yaml
+++ b/ci/deps/actions-pypy-39.yaml
@@ -9,7 +9,7 @@ dependencies:
   # build dependencies
   - versioneer
-  - cython>=0.29.33
+  - cython<4.0.0a0
   - meson=1.2.1
   - meson-python=0.13.1
diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf
index 3582e0c0dabf9..33fbf2507ed62 100644
Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf and b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf differ
diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx
index 746f508516964..5ce2e3be48d55 100644
Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx and b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx differ
diff --git a/doc/cheatsheet/README.md b/doc/cheatsheet/README.md
index b8599acff2f6e..b72c093b4ba2f 100644
--- a/doc/cheatsheet/README.md
+++ b/doc/cheatsheet/README.md
@@ -12,7 +12,7 @@ This cheat sheet, originally written by Irv Lustig, [Princeton Consultants](http
 | Pandas_Cheat_Sheet_JA | Japanese | | |
 | Pandas_Cheat_Sheet_FA | Persian | | |
-
+The English version has additional material that is not in the versions in other languages.
 
 **Alternative**
diff --git a/doc/source/_static/css/getting_started.css b/doc/source/_static/css/getting_started.css
index b02311eb66080..55141f8955066 100644
--- a/doc/source/_static/css/getting_started.css
+++ b/doc/source/_static/css/getting_started.css
@@ -249,6 +249,7 @@ ul.task-bullet > li > p:first-child {
 
 .tutorial-card .card-header {
     --bs-card-cap-color: var(--pst-color-text-base);
+    color: var(--pst-color-text-base);
     cursor: pointer;
     background-color: var(--pst-color-surface);
     border: 1px solid var(--pst-color-border)
@@ -256,6 +257,7 @@ ul.task-bullet > li > p:first-child {
 
 .tutorial-card .card-body {
     background-color: var(--pst-color-on-background);
+    color: var(--pst-color-text-base);
 }
 
 .tutorial-card .badge {
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 677ee6274b093..f222a228531ff 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -114,6 +114,8 @@
         ):
             exclude_patterns.append(rel_fname)
         elif single_doc and rel_fname != pattern:
+            if "\\" in rel_fname:
+                rel_fname = rel_fname.replace("\\", "/")
             exclude_patterns.append(rel_fname)
 
 with open(os.path.join(source_path, "index.rst.template"), encoding="utf-8") as f:
diff --git a/doc/source/development/community.rst b/doc/source/development/community.rst
index ab8294b8f135a..1c698d130ea6c 100644
--- a/doc/source/development/community.rst
+++ b/doc/source/development/community.rst
@@ -77,7 +77,7 @@ Any community member can open issues to:
 - Ask questions, e.g. "I noticed the behavior of a certain function
   changed between versions. Is this expected?".
 
-  Ideally, your questions should be related to how pandas works rather
+  - Ideally, your questions should be related to how pandas works rather
   than how you use pandas. `StackOverflow `_
   is better suited for answering usage questions, and we ask that all usage
   questions are first asked on StackOverflow. Thank you for respecting our
diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
index 4d99f282aa695..66178a88e3e31 100644
--- a/doc/source/development/contributing.rst
+++ b/doc/source/development/contributing.rst
@@ -234,14 +234,14 @@ and merged into project to appear the in the next release. To submit a pull requ
 #. Write a descriptive title that includes prefixes. pandas uses a convention for title
    prefixes. Here are some common ones along with general guidelines for when to use them:
 
-    * ENH: Enhancement, new functionality
-    * BUG: Bug fix
-    * DOC: Additions/updates to documentation
-    * TST: Additions/updates to tests
-    * BLD: Updates to the build process/scripts
-    * PERF: Performance improvement
-    * TYP: Type annotations
-    * CLN: Code cleanup
+   * ENH: Enhancement, new functionality
+   * BUG: Bug fix
+   * DOC: Additions/updates to documentation
+   * TST: Additions/updates to tests
+   * BLD: Updates to the build process/scripts
+   * PERF: Performance improvement
+   * TYP: Type annotations
+   * CLN: Code cleanup
 
 #. Write a description of your changes in the ``Preview Discussion`` tab
 #. Click ``Send Pull Request``.
diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst
index b8d568428c156..73bc756de9302 100644
--- a/doc/source/development/contributing_codebase.rst
+++ b/doc/source/development/contributing_codebase.rst
@@ -444,11 +444,11 @@ be located.
        result = ser.loc[[3, 4]]
        tm.assert_series_equal(result, expected)
 
-    In cases like this, the test location should be based on the *underlying*
-    method being tested. Or in the case of a test for a bugfix, the location
-    of the actual bug. So in this example, we know that ``Series.__getitem__``
-    calls ``Series.loc.__getitem__``, so this is *really* a test for
-    ``loc.__getitem__``. So this test belongs in ``tests.indexing.test_loc``.
+   In cases like this, the test location should be based on the *underlying*
+   method being tested. Or in the case of a test for a bugfix, the location
+   of the actual bug. So in this example, we know that ``Series.__getitem__``
+   calls ``Series.loc.__getitem__``, so this is *really* a test for
+   ``loc.__getitem__``. So this test belongs in ``tests.indexing.test_loc``.
 
 6. Is your test for a DataFrame or Series method?
diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst
index 98bd4b00d016b..d7b779debcd5e 100644
--- a/doc/source/development/contributing_environment.rst
+++ b/doc/source/development/contributing_environment.rst
@@ -251,7 +251,7 @@ This option allows you to configure where meson stores your built C extensions,
 Sometimes, it might be useful to compile pandas with debugging symbols, when debugging C extensions.
 Appending ``-Csetup-args="-Ddebug=true"`` will do the trick.
 
-With pip, it is possible to chain together multiple config settings (for example specifying both a build directory
+With pip, it is possible to chain together multiple config settings. For example, specifying both a build directory
 and building with debug symbols would look like
 ``-Cbuilddir="your builddir here" -Csetup-args="-Dbuildtype=debug"``.
diff --git a/doc/source/development/contributing_gitpod.rst b/doc/source/development/contributing_gitpod.rst
index b70981b4d307d..447b7b20a8ae5 100644
--- a/doc/source/development/contributing_gitpod.rst
+++ b/doc/source/development/contributing_gitpod.rst
@@ -158,8 +158,8 @@ Option 1: using Liveserve
    file and click on **Open with Live Serve**. Alternatively, you can open the
    file in the editor and click on the **Go live** button on the status bar.
 
-    .. image:: ./gitpod-imgs/vscode-statusbar.png
-        :alt: Gitpod workspace VSCode start live serve screenshot
+   .. image:: ./gitpod-imgs/vscode-statusbar.png
+      :alt: Gitpod workspace VSCode start live serve screenshot
 
 #. A simple browser will open to the right-hand side of the editor. We recommend
    closing it and click on the **Open in browser** button in the pop-up.
@@ -182,13 +182,13 @@ uses the rst extension with docutils.
    :kbd:`Ctrl-Shift-P` in Linux and Windows. Start typing "restructured"
    and choose either "Open preview" or "Open preview to the Side".
 
-    .. image:: ./gitpod-imgs/vscode-rst.png
-        :alt: Gitpod workspace VSCode open rst screenshot
+   .. image:: ./gitpod-imgs/vscode-rst.png
+      :alt: Gitpod workspace VSCode open rst screenshot
 
 #. As you work on the document, you will see a live rendering of it on the editor.
 
-    .. image:: ./gitpod-imgs/rst-rendering.png
-        :alt: Gitpod workspace VSCode rst rendering screenshot
+   .. image:: ./gitpod-imgs/rst-rendering.png
+      :alt: Gitpod workspace VSCode rst rendering screenshot
 
 If you want to see the final output with the ``html`` theme you will need to
 rebuild the docs with ``make html`` and use Live Serve as described in option 1.
diff --git a/doc/source/development/debugging_extensions.rst b/doc/source/development/debugging_extensions.rst
index 0ea1c112cb55b..c8127e0cc2996 100644
--- a/doc/source/development/debugging_extensions.rst
+++ b/doc/source/development/debugging_extensions.rst
@@ -10,9 +10,9 @@ pandas uses Cython and C/C++ `extension modules `_
-  1. `Fundamental Python Debugging Part 1 - Python `_
-  2. `Fundamental Python Debugging Part 2 - Python Extensions `_
-  3. `Fundamental Python Debugging Part 3 - Cython Extensions `_
+1. `Fundamental Python Debugging Part 1 - Python `_
+2. `Fundamental Python Debugging Part 2 - Python Extensions `_
+3. `Fundamental Python Debugging Part 3 - Cython Extensions `_
 
 Debugging locally
 -----------------
@@ -23,7 +23,7 @@ By default building pandas from source will generate a release build. To generat
 
 .. note::
 
-   conda environments update CFLAGS/CPPFLAGS with flags that are geared towards generating releases. If using conda, you may need to set ``CFLAGS="$CFLAGS -O0"`` and ``CPPFLAGS="$CPPFLAGS -O0"`` to ensure optimizations are turned off for debugging
+   conda environments update CFLAGS/CPPFLAGS with flags that are geared towards generating releases, and may work counter towards usage in a development environment. If using conda, you should unset these environment variables via ``export CFLAGS=`` and ``export CPPFLAGS=``
 
 By specifying ``builddir="debug"`` all of the targets will be built and placed in the debug directory relative to the project root. This helps to keep your debug and release artifacts separate; you are of course able to choose a different directory name or omit altogether if you do not care to separate build types.
diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst
index c5c4b7c449ce7..21a840fbe9a5f 100644
--- a/doc/source/development/developer.rst
+++ b/doc/source/development/developer.rst
@@ -116,19 +116,19 @@ The ``metadata`` field is ``None`` except for:
   omitted it is assumed to be nanoseconds.
 * ``categorical``: ``{'num_categories': K, 'ordered': is_ordered, 'type': $TYPE}``
 
-    * Here ``'type'`` is optional, and can be a nested pandas type specification
-      here (but not categorical)
+  * Here ``'type'`` is optional, and can be a nested pandas type specification
+    here (but not categorical)
 
 * ``unicode``: ``{'encoding': encoding}``
 
-    * The encoding is optional, and if not present is UTF-8
+  * The encoding is optional, and if not present is UTF-8
 
 * ``object``: ``{'encoding': encoding}``. Objects can be serialized and stored
   in ``BYTE_ARRAY`` Parquet columns. The encoding can be one of:
 
-    * ``'pickle'``
-    * ``'bson'``
-    * ``'json'``
+  * ``'pickle'``
+  * ``'bson'``
+  * ``'json'``
 
 * ``timedelta``: ``{'unit': 'ns'}``. The ``'unit'`` is optional, and if omitted
   it is assumed to be nanoseconds. This metadata is optional altogether
diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst
index c572559dcc3e0..c37925f7e271a 100644
--- a/doc/source/development/maintaining.rst
+++ b/doc/source/development/maintaining.rst
@@ -218,11 +218,11 @@ pandas supports point releases (e.g. ``1.4.3``) that aim to:
 
 1. Fix bugs in new features introduced in the first minor version release.
 
-    * e.g. If a new feature was added in ``1.4`` and contains a bug, a fix can be applied in ``1.4.3``
+   * e.g. If a new feature was added in ``1.4`` and contains a bug, a fix can be applied in ``1.4.3``
 
 2. Fix bugs that used to work in a few minor releases prior. There should be agreement
    between core team members that a backport is appropriate.
 
-    * e.g. If a feature worked in ``1.2`` and stopped working since ``1.3``, a fix can be applied in ``1.4.3``.
+   * e.g. If a feature worked in ``1.2`` and stopped working since ``1.3``, a fix can be applied in ``1.4.3``.
 
 Since pandas minor releases are based on GitHub branches (e.g. point release of ``1.4`` are based off the ``1.4.x`` branch),
 "backporting" means merging a pull request fix to the ``main`` branch and correct minor branch associated with the next point release.
@@ -289,8 +289,8 @@ The required steps for adding a maintainer are:
 1. Contact the contributor and ask their interest to join.
 2. Add the contributor to the appropriate `GitHub Team `_ if accepted the invitation.
 
-    * ``pandas-core`` is for core team members
-    * ``pandas-triage`` is for pandas triage members
+   * ``pandas-core`` is for core team members
+   * ``pandas-triage`` is for pandas triage members
 
 If adding to ``pandas-core``, there are two additional steps:
@@ -467,10 +467,10 @@ Post-Release
    patch releases. The exact instructions are (replace the example version numbers by
    the appropriate ones for the version you are releasing):
 
-    - Log in to the server and use the correct user.
-    - ``cd /var/www/html/pandas-docs/``
-    - ``ln -sfn version/2.1 stable`` (for a major or minor release)
-    - ``ln -sfn version/2.0.3 version/2.0`` (for a patch release)
+   - Log in to the server and use the correct user.
+   - ``cd /var/www/html/pandas-docs/``
+   - ``ln -sfn version/2.1 stable`` (for a major or minor release)
+   - ``ln -sfn version/2.0.3 version/2.0`` (for a patch release)
 
 2. If releasing a major or minor release, open a PR in our source code to update
    ``web/pandas/versions.json``, to have the desired versions in the documentation
@@ -487,8 +487,8 @@ Post-Release
 6. Announce the new release in the official channels (use previous announcements
    for reference):
 
-    - The pandas-dev and pydata mailing lists
-    - X, Mastodon, Telegram and LinkedIn
+   - The pandas-dev and pydata mailing lists
+   - X, Mastodon, Telegram and LinkedIn
 
 7. Update this release instructions to fix anything incorrect and to update about any
    change since the last release.
diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst
index d9d7d916b0238..cc7add87b5935 100644
--- a/doc/source/getting_started/comparison/comparison_with_r.rst
+++ b/doc/source/getting_started/comparison/comparison_with_r.rst
@@ -383,7 +383,7 @@ In Python, since ``a`` is a list, you can simply use list comprehension.
 
 .. ipython:: python
 
-   a = np.array(list(range(1, 24)) + [np.NAN]).reshape(2, 3, 4)
+   a = np.array(list(range(1, 24)) + [np.nan]).reshape(2, 3, 4)
    pd.DataFrame([tuple(list(x) + [val]) for x, val in np.ndenumerate(a)])
 
 meltlist
@@ -402,7 +402,7 @@ In Python, this list would be a list of tuples, so
 
 .. ipython:: python
 
-   a = list(enumerate(list(range(1, 5)) + [np.NAN]))
+   a = list(enumerate(list(range(1, 5)) + [np.nan]))
    pd.DataFrame(a)
 
 For more details and examples see :ref:`the Intro to Data Structures
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index bda959f380e8a..1589fea5f8953 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -183,9 +183,9 @@ Installable with ``pip install "pandas[performance]"``
 ===================================================== ================== ================== ===================================================================================================================================================================================
 Dependency Minimum Version pip extra Notes
 ===================================================== ================== ================== ===================================================================================================================================================================================
-`numexpr `__ 2.8.4 performance Accelerates certain numerical operations by using multiple cores as well as smart chunking and caching to achieve large speedups
+`numexpr `__ 2.9.0 performance Accelerates certain numerical operations by using multiple cores as well as smart chunking and caching to achieve large speedups
 `bottleneck `__ 1.3.6 performance Accelerates certain types of ``nan`` by using specialized cython routines to achieve large speedup.
-`numba `__ 0.56.4 performance Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler.
+`numba `__ 0.59.0 performance Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler.
 ===================================================== ================== ================== ===================================================================================================================================================================================
 
 Visualization
@@ -196,8 +196,8 @@ Installable with ``pip install "pandas[plot, output-formatting]"``.
 ========================================================== ================== ================== =======================================================
 Dependency Minimum Version pip extra Notes
 ========================================================== ================== ================== =======================================================
-`matplotlib `__ 3.6.3 plot Plotting library
-`Jinja2 `__ 3.1.2 output-formatting Conditional formatting with DataFrame.style
+`matplotlib `__ 3.8.3 plot Plotting library
+`Jinja2 `__ 3.1.3 output-formatting Conditional formatting with DataFrame.style
 `tabulate `__ 0.9.0 output-formatting Printing in Markdown-friendly format (see `tabulate`_)
 ========================================================== ================== ================== =======================================================
 
@@ -209,8 +209,8 @@ Installable with ``pip install "pandas[computation]"``.
 ============================================== ================== =============== =======================================
 Dependency Minimum Version pip extra Notes
 ============================================== ================== =============== =======================================
-`SciPy `__ 1.10.0 computation Miscellaneous statistical functions
-`xarray `__ 2022.12.0 computation pandas-like API for N-dimensional data
+`SciPy `__ 1.12.0 computation Miscellaneous statistical functions
+`xarray `__ 2024.1.1 computation pandas-like API for N-dimensional data
 ============================================== ================== =============== =======================================
 
 .. _install.excel_dependencies:
 
 Installable with ``pip install "pandas[excel]"``.
 
 ================================================================== ================== =============== =============================================================
 Dependency Minimum Version pip extra Notes
 ================================================================== ================== =============== =============================================================
 `xlrd `__ 2.0.1 excel Reading for xls files
-`xlsxwriter `__ 3.0.5 excel Writing for xlsx files
-`openpyxl `__ 3.1.0 excel Reading / writing for Excel 2010 xlsx/xlsm/xltx/xltm files
+`xlsxwriter `__ 3.2.0 excel Writing for xlsx files
+`openpyxl `__ 3.1.2 excel Reading / writing for Excel 2010 xlsx/xlsm/xltx/xltm files
 `pyxlsb `__ 1.0.10 excel Reading for xlsb files
 `python-calamine `__ 0.1.7 excel Reading for xls/xlsx/xlsm/xlsb/xla/xlam/ods files
 `odfpy `__ 1.4.1 excel Reading / writing for OpenDocument 1.2 files
@@ -239,7 +239,7 @@ Installable with ``pip install "pandas[html]"``.
 =============================================================== ================== =============== ==========================
 Dependency Minimum Version pip extra Notes
 =============================================================== ================== =============== ==========================
-`BeautifulSoup4 `__ 4.11.2 html HTML parser for read_html
+`BeautifulSoup4 `__ 4.12.3 html HTML parser for read_html
 `html5lib `__ 1.1 html HTML parser for read_html
 `lxml `__ 4.9.2 html HTML parser for read_html
 =============================================================== ================== =============== ==========================
@@ -291,7 +291,7 @@ Dependency Minimum Versi
                                                                                     mysql, sql-other
 `psycopg2 `__ 2.9.6 postgresql PostgreSQL engine for sqlalchemy
-`pymysql `__ 1.0.2 mysql MySQL engine for sqlalchemy
+`pymysql `__ 1.1.0 mysql MySQL engine for sqlalchemy
 `adbc-driver-postgresql `__ 0.10.0 postgresql ADBC Driver for PostgreSQL
 `adbc-driver-sqlite `__ 0.8.0 sql-other ADBC Driver for SQLite
 ================================================================== ================== =============== ============================================
@@ -299,17 +299,17 @@
 Other data sources
 ^^^^^^^^^^^^^^^^^^
 
-Installable with ``pip install "pandas[hdf5, parquet, feather, spss, excel]"``
+Installable with ``pip install "pandas[hdf5, parquet, iceberg, feather, spss, excel]"``
 
 ====================================================== ================== ================ ==========================================================
 Dependency Minimum Version pip extra Notes
 ====================================================== ================== ================ ==========================================================
 `PyTables `__ 3.8.0 hdf5 HDF5-based reading / writing
-`blosc `__ 1.21.3 hdf5 Compression for HDF5; only available on ``conda``
 `zlib `__ hdf5 Compression for HDF5
-`fastparquet `__ 2023.10.0 - Parquet reading / writing (pyarrow is default)
writing (pyarrow is default) +`fastparquet `__ 2024.2.0 - Parquet reading / writing (pyarrow is default) `pyarrow `__ 10.0.1 parquet, feather Parquet, ORC, and feather reading / writing -`pyreadstat `__ 1.2.0 spss SPSS files (.sav) reading +`PyIceberg `__ 0.7.1 iceberg Apache Iceberg reading / writing +`pyreadstat `__ 1.2.6 spss SPSS files (.sav) reading `odfpy `__ 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing ====================================================== ================== ================ ========================================================== @@ -329,10 +329,10 @@ Installable with ``pip install "pandas[fss, aws, gcp]"`` ============================================ ================== =============== ========================================================== Dependency Minimum Version pip extra Notes ============================================ ================== =============== ========================================================== -`fsspec `__ 2022.11.0 fss, gcp, aws Handling files aside from simple local and HTTP (required +`fsspec `__ 2023.12.2 fss, gcp, aws Handling files aside from simple local and HTTP (required dependency of s3fs, gcsfs). -`gcsfs `__ 2022.11.0 gcp Google Cloud Storage access -`s3fs `__ 2022.11.0 aws Amazon S3 access +`gcsfs `__ 2023.12.2 gcp Google Cloud Storage access +`s3fs `__ 2023.12.2 aws Amazon S3 access ============================================ ================== =============== ========================================================== Clipboard diff --git a/doc/source/getting_started/intro_tutorials/includes/titanic.rst b/doc/source/getting_started/intro_tutorials/includes/titanic.rst index 6e03b848aab06..41159516200fa 100644 --- a/doc/source/getting_started/intro_tutorials/includes/titanic.rst +++ b/doc/source/getting_started/intro_tutorials/includes/titanic.rst @@ -11,7 +11,7 @@ This tutorial uses the Titanic data set, stored as CSV. The data consists of the following data columns: - PassengerId: Id of every passenger. -- Survived: Indication whether passenger survived. ``0`` for yes and ``1`` for no. +- Survived: Indication whether passenger survived. ``0`` for no and ``1`` for yes. - Pclass: One out of the 3 ticket classes: Class ``1``, Class ``2`` and Class ``3``. - Name: Name of passenger. - Sex: Gender of passenger. 
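For orientation, the getting-started tutorials consume this data set with :func:`read_csv`; a minimal sketch (the file path below is illustrative only):

.. code-block:: python

    import pandas as pd

    # Load the Titanic passenger data from CSV (path is illustrative)
    titanic = pd.read_csv("data/titanic.csv")
    titanic.head()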
diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 5be08f163e6ce..d37eebef5c0c0 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -664,6 +664,7 @@ Data type introspection api.types.is_datetime64_dtype api.types.is_datetime64_ns_dtype api.types.is_datetime64tz_dtype + api.types.is_dtype_equal api.types.is_extension_array_dtype api.types.is_float_dtype api.types.is_int64_dtype diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst index fc180c8161a7e..004651ac0074f 100644 --- a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -79,6 +79,8 @@ Function application DataFrameGroupBy.cumsum DataFrameGroupBy.describe DataFrameGroupBy.diff + DataFrameGroupBy.ewm + DataFrameGroupBy.expanding DataFrameGroupBy.ffill DataFrameGroupBy.first DataFrameGroupBy.head @@ -130,6 +132,8 @@ Function application SeriesGroupBy.cumsum SeriesGroupBy.describe SeriesGroupBy.diff + SeriesGroupBy.ewm + SeriesGroupBy.expanding SeriesGroupBy.ffill SeriesGroupBy.first SeriesGroupBy.head diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst index 805fb8b783459..37d9e7f6b7dbd 100644 --- a/doc/source/reference/io.rst +++ b/doc/source/reference/io.rst @@ -156,6 +156,16 @@ Parquet read_parquet DataFrame.to_parquet +Iceberg +~~~~~~~ +.. autosummary:: + :toctree: api/ + + read_iceberg + DataFrame.to_iceberg + +.. warning:: ``read_iceberg`` is experimental and may change without warning. + ORC ~~~ .. autosummary:: diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 72bb93d21a99f..8beaa73090673 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -178,12 +178,26 @@ Getitem (``[]``) ~~~~~~~~~~~~~~~~ For a :class:`DataFrame`, passing a single label selects a column and -yields a :class:`Series` equivalent to ``df.A``: +yields a :class:`Series`: .. ipython:: python df["A"] +If the label only contains letters, numbers, and underscores, you can +alternatively use the column name attribute: + +.. ipython:: python + + df.A + +Passing a list of column labels selects multiple columns, which can be useful +for getting a subset/rearranging: + +.. ipython:: python + + df[["B", "A"]] + For a :class:`DataFrame`, passing a slice ``:`` selects matching rows: .. ipython:: python diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 14af5d9dc22c8..8155aa0ae03fa 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -2064,12 +2064,12 @@ different numeric dtypes will **NOT** be combined. The following example will gi .. ipython:: python - df1 = pd.DataFrame(np.random.randn(8, 1), columns=["A"], dtype="float32") + df1 = pd.DataFrame(np.random.randn(8, 1), columns=["A"], dtype="float64") df1 df1.dtypes df2 = pd.DataFrame( { - "A": pd.Series(np.random.randn(8), dtype="float16"), + "A": pd.Series(np.random.randn(8), dtype="float32"), "B": pd.Series(np.random.randn(8)), "C": pd.Series(np.random.randint(0, 255, size=8), dtype="uint8"), # [0,255] (range of uint8) } diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index e55a6cda47ac2..9c37f317a805e 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -50,7 +50,7 @@ We have a :class:`DataFrame` to which we want to apply a function row-wise. 
{ "a": np.random.randn(1000), "b": np.random.randn(1000), - "N": np.random.randint(100, 1000, (1000)), + "N": np.random.randint(100, 1000, (1000), dtype="int64"), "x": "x", } ) @@ -83,7 +83,7 @@ using the `prun ipython magic function `. * A boolean array. @@ -1461,16 +1461,33 @@ Looking up values by index/column labels Sometimes you want to extract a set of values given a sequence of row labels and column labels, this can be achieved by ``pandas.factorize`` and NumPy indexing. -For instance: -.. ipython:: python +For heterogeneous column types, we subset columns to avoid unnecessary NumPy conversions: + +.. code-block:: python + + def pd_lookup_het(df, row_labels, col_labels): + rows = df.index.get_indexer(row_labels) + cols = df.columns.get_indexer(col_labels) + sub = df.take(np.unique(cols), axis=1) + sub = sub.take(np.unique(rows), axis=0) + rows = sub.index.get_indexer(row_labels) + values = sub.melt()["value"] + cols = sub.columns.get_indexer(col_labels) + flat_index = rows + cols * len(sub) + result = values[flat_index] + return result + +For homogeneous column types, it is fastest to skip column subsetting and go directly to NumPy: + +.. code-block:: python - df = pd.DataFrame({'col': ["A", "A", "B", "B"], - 'A': [80, 23, np.nan, 22], - 'B': [80, 55, 76, 67]}) - df - idx, cols = pd.factorize(df['col']) - df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx] + def pd_lookup_hom(df, row_labels, col_labels): + rows = df.index.get_indexer(row_labels) + df = df.loc[:, sorted(set(col_labels))] + cols = df.columns.get_indexer(col_labels) + result = df.to_numpy()[rows, cols] + return result Formerly this could be achieved with the dedicated ``DataFrame.lookup`` method which was deprecated in version 1.2.0 and removed in version 2.0.0. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 23da52f26358f..25f1e11e6b603 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -26,9 +26,10 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like text, Local clipboard, :ref:`read_clipboard`, :ref:`to_clipboard` binary,`MS Excel `__ , :ref:`read_excel`, :ref:`to_excel` binary,`OpenDocument `__, :ref:`read_excel`, NA - binary,`HDF5 Format `__, :ref:`read_hdf`, :ref:`to_hdf` + binary,`HDF5 Format `__, :ref:`read_hdf`, :ref:`to_hdf` binary,`Feather Format `__, :ref:`read_feather`, :ref:`to_feather` binary,`Parquet Format `__, :ref:`read_parquet`, :ref:`to_parquet` + binary,`Apache Iceberg `__, :ref:`read_iceberg` , :ref:`to_iceberg` binary,`ORC Format `__, :ref:`read_orc`, :ref:`to_orc` binary,`Stata `__, :ref:`read_stata`, :ref:`to_stata` binary,`SAS `__, :ref:`read_sas` , NA @@ -1414,7 +1415,7 @@ of multi-columns indices. .. note:: If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it - with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index will + with ``df.to_csv(..., index=False)``), then any ``names`` on the columns index will be *lost*. .. ipython:: python @@ -5403,6 +5404,125 @@ The above example creates a partitioned dataset that may look like: except OSError: pass +.. _io.iceberg: + +Iceberg +------- + +.. versionadded:: 3.0.0 + +Apache Iceberg is a high performance open-source format for large analytic tables. +Iceberg enables the use of SQL tables for big data while making it possible for different +engines to safely work with the same tables at the same time. 
+ +Iceberg supports predicate pushdown and column pruning, which are available to pandas +users via the ``row_filter`` and ``selected_fields`` parameters of the :func:`~pandas.read_iceberg` +function. This makes it convenient to extract from large tables a subset that fits in memory as a +pandas ``DataFrame``. + +Internally, pandas uses PyIceberg_ to query Iceberg. + +.. _PyIceberg: https://py.iceberg.apache.org/ + +A simple example loading all data from an Iceberg table ``my_table`` defined in the +``my_catalog`` catalog: + +.. code-block:: python + + df = pd.read_iceberg("my_table", catalog_name="my_catalog") + +Catalogs must be defined in the ``.pyiceberg.yaml`` file, usually in the home directory. +It is possible to change properties of the catalog definition with the +``catalog_properties`` parameter: + +.. code-block:: python + + df = pd.read_iceberg( + "my_table", + catalog_name="my_catalog", + catalog_properties={"s3.secret-access-key": "my_secret"}, + ) + +It is also possible to fully specify the catalog in ``catalog_properties`` and not provide +a ``catalog_name``: + +.. code-block:: python + + df = pd.read_iceberg( + "my_table", + catalog_properties={ + "uri": "http://127.0.0.1:8181", + "s3.endpoint": "http://127.0.0.1:9000", + }, + ) + +To create the ``DataFrame`` with only a subset of the columns: + +.. code-block:: python + + df = pd.read_iceberg( + "my_table", + catalog_name="my_catalog", + selected_fields=["my_column_3", "my_column_7"] + ) + +This will make the function execute faster, since other columns won't be read, and it will also +save memory, since the data from other columns won't be loaded into the underlying memory of +the ``DataFrame``. + +To fetch only a subset of the rows, use the ``limit`` parameter: + +.. code-block:: python + + df = pd.read_iceberg( + "my_table", + catalog_name="my_catalog", + limit=100, + ) + +This will create a ``DataFrame`` with 100 rows, assuming the table contains at least that many +rows. + +To fetch a subset of the rows based on a condition, use the ``row_filter`` +parameter: + +.. code-block:: python + + df = pd.read_iceberg( + "my_table", + catalog_name="my_catalog", + row_filter="distance > 10.0", + ) + +Reading a particular snapshot is also possible by providing the snapshot ID as an argument to +``snapshot_id``. + +To save a ``DataFrame`` to Iceberg, use the :meth:`DataFrame.to_iceberg` +method: + +.. code-block:: python + + df.to_iceberg("my_table", catalog_name="my_catalog") + +Specifying the catalog works in the same way as for :func:`read_iceberg`, with the +``catalog_name`` and ``catalog_properties`` parameters. + +The location of the table can be specified with the ``location`` parameter: + +.. code-block:: python + + df.to_iceberg( + "my_table", + catalog_name="my_catalog", + location="s3://my-data-lake/my-iceberg-tables", + ) + +It is possible to add properties to the table snapshot by passing a dictionary to the +``snapshot_properties`` parameter. + +More information about the Iceberg format can be found in the `Apache Iceberg official +page `__. + .. _io.orc: ORC diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 60a66f5e6f2a8..af377dd7a32f2 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -107,7 +107,7 @@ Joining logic of the resulting axis The ``join`` keyword specifies how to handle axis values that don't exist in the first :class:`DataFrame`.
-``join='outer'`` takes the union of all axis values +``join='outer'`` takes the union of all axis values. .. ipython:: python @@ -130,7 +130,7 @@ The ``join`` keyword specifies how to handle axis values that don't exist in the p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False); plt.close("all"); -``join='inner'`` takes the intersection of the axis values +``join='inner'`` takes the intersection of the axis values. .. ipython:: python @@ -296,7 +296,7 @@ the index of the :class:`DataFrame` pieces: result.index.levels -``levels`` argument allows specifying resulting levels associated with the ``keys`` +``levels`` argument allows specifying resulting levels associated with the ``keys``. .. ipython:: python @@ -322,7 +322,7 @@ Appending rows to a :class:`DataFrame` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ If you have a :class:`Series` that you want to append as a single row to a :class:`DataFrame`, you can convert the row into a -:class:`DataFrame` and use :func:`concat` +:class:`DataFrame` and use :func:`concat`. .. ipython:: python @@ -355,7 +355,7 @@ Merge types their indexes which must contain unique values. * **many-to-one**: joining a unique index to one or more columns in a different :class:`DataFrame`. -* **many-to-many** : joining columns on columns. +* **many-to-many**: joining columns on columns. .. note:: @@ -485,8 +485,9 @@ either the left or right tables, the values in the joined table will be plt.close("all"); You can merge :class:`Series` and a :class:`DataFrame` with a :class:`MultiIndex` if the names of -the :class:`MultiIndex` correspond to the columns from the :class:`DataFrame`. Transform -the :class:`Series` to a :class:`DataFrame` using :meth:`Series.reset_index` before merging +the :class:`MultiIndex` correspond to the columns from the :class:`DataFrame`. You can also +transform the :class:`Series` to a :class:`DataFrame` using :meth:`Series.reset_index` +before merging: .. ipython:: python @@ -504,7 +505,7 @@ the :class:`Series` to a :class:`DataFrame` using :meth:`Series.reset_index` bef pd.merge(df, ser.reset_index(), on=["Let", "Num"]) -Performing an outer join with duplicate join keys in :class:`DataFrame` +Performing an outer join with duplicate join keys in :class:`DataFrame`: .. ipython:: python @@ -956,7 +957,7 @@ location. :func:`merge_ordered` --------------------- -:func:`merge_ordered` combines order data such as numeric or time series data +:func:`merge_ordered` combines ordered data such as numeric or time series data with optional filling of missing data with ``fill_method``. .. ipython:: python @@ -1082,7 +1083,7 @@ Stack the differences on rows. df.compare(df2, align_axis=0) -Keep all original rows and columns with ``keep_shape=True`` +Keep all original rows and columns with ``keep_shape=True``. .. ipython:: python diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index e15939eb49239..56f4c80cbde16 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -258,9 +258,6 @@ will convert your data to use the nullable data types supporting :class:`NA`, such as :class:`Int64Dtype` or :class:`ArrowDtype`. This is especially helpful after reading in data sets from IO methods where data types were inferred. -In this example, while the dtypes of all columns are changed, we show the results for -the first 10 columns. - .. ipython:: python import io @@ -434,7 +431,7 @@ where the index and column aligns between the original object and the filled obj .. 
note:: - :meth:`DataFrame.where` can also be used to fill NA values.Same result as above. + :meth:`DataFrame.where` can also be used to fill NA values. Same result as above. .. ipython:: python diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 8c5e98791a9ef..bc5a2d5ed5735 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -395,7 +395,7 @@ variables and the values representing the presence of those variables per row. pd.get_dummies(df["key"]) df["key"].str.get_dummies() -``prefix`` adds a prefix to the the column names which is useful for merging the result +``prefix`` adds a prefix to the column names which is useful for merging the result with the original :class:`DataFrame`: .. ipython:: python diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 25bcb8bcc0c93..624086f7a8505 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -40,8 +40,8 @@ and in the Python interpreter. .. ipython:: python - 'dense : {:0.2f} bytes'.format(df.memory_usage().sum() / 1e3) - 'sparse: {:0.2f} bytes'.format(sdf.memory_usage().sum() / 1e3) + f'dense: {df.memory_usage().sum()} bytes' + f'sparse: {sdf.memory_usage().sum()} bytes' Functionally, their behavior should be nearly identical to their dense counterparts. diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 10260cb011d90..ac0fc9e53ee94 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -2458,7 +2458,7 @@ you can use the ``tz_convert`` method. For ``pytz`` time zones, it is incorrect to pass a time zone object directly into the ``datetime.datetime`` constructor - (e.g., ``datetime.datetime(2011, 1, 1, tzinfo=pytz.timezone('US/Eastern'))``. + (e.g., ``datetime.datetime(2011, 1, 1, tzinfo=pytz.timezone('US/Eastern'))``). Instead, the datetime needs to be localized using the ``localize`` method on the ``pytz`` time zone object. diff --git a/doc/source/user_guide/user_defined_functions.rst b/doc/source/user_guide/user_defined_functions.rst new file mode 100644 index 0000000000000..6f7fdaddac622 --- /dev/null +++ b/doc/source/user_guide/user_defined_functions.rst @@ -0,0 +1,419 @@ +.. _udf: + +{{ header }} + +***************************** +User-Defined Functions (UDFs) +***************************** + +In pandas, User-Defined Functions (UDFs) provide a way to extend the library’s +functionality by allowing users to apply custom computations to their data. While +pandas comes with a set of built-in functions for data manipulation, UDFs offer +flexibility when built-in methods are not sufficient. These functions can be +applied at different levels: element-wise, row-wise, column-wise, or group-wise, +and behave differently, depending on the method used. + +Here’s a simple example to illustrate a UDF applied to a Series: + +.. ipython:: python + + s = pd.Series([1, 2, 3]) + + # Simple UDF that adds 1 to a value + def add_one(x): + return x + 1 + + # Apply the function element-wise using .map + s.map(add_one) + +Why Not To Use User-Defined Functions +------------------------------------- + +While UDFs provide flexibility, they come with significant drawbacks, primarily +related to performance and behavior. When using UDFs, pandas must perform inference +on the result, and that inference could be incorrect. 
Furthermore, unlike vectorized operations, +UDFs are slower because pandas can't optimize their computations, leading to +inefficient processing. + +.. note:: + In general, most tasks can and should be accomplished using pandas’ built-in methods or vectorized operations. + +Despite their drawbacks, UDFs can be helpful when: + +* **Custom Computations Are Needed**: Implementing complex logic or domain-specific calculations that pandas' + built-in methods cannot handle. +* **Extending pandas' Functionality**: Applying external libraries or specialized algorithms unavailable in pandas. +* **Handling Complex Grouped Operations**: Performing operations on grouped data that standard methods do not support. + +For example: + +.. code-block:: python + + from sklearn.linear_model import LinearRegression + + # Sample data + df = pd.DataFrame({ + 'group': ['A', 'A', 'A', 'B', 'B', 'B'], + 'x': [1, 2, 3, 1, 2, 3], + 'y': [2, 4, 6, 1, 2, 1.5] + }) + + # Function to fit a model to each group + def fit_model(group): + model = LinearRegression() + model.fit(group[['x']], group['y']) + group['y_pred'] = model.predict(group[['x']]) + return group + + result = df.groupby('group').apply(fit_model) + + +Methods that support User-Defined Functions +------------------------------------------- + +User-Defined Functions can be applied across various pandas methods: + ++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ +| Method | Function Input | Function Output | Description | ++===============================+========================+==========================+==============================================================================================================================================+ +| :ref:`udf.map` | Scalar | Scalar | Apply a function to each element | ++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ +| :ref:`udf.apply` (axis=0) | Column (Series) | Column (Series) | Apply a function to each column | ++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ +| :ref:`udf.apply` (axis=1) | Row (Series) | Row (Series) | Apply a function to each row | ++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ +| :ref:`udf.pipe` | Series or DataFrame | Series or DataFrame | Chain functions together to apply to Series or Dataframe | ++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ +| :ref:`udf.filter` | Series or DataFrame | Boolean | Only accepts UDFs in group by. 
Function is called for each group, and the group is removed from the result if the function returns ``False`` | ++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ +| :ref:`udf.agg` | Series or DataFrame | Scalar or Series | Aggregates and summarizes values, e.g., sum or custom reducer | ++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ +| :ref:`udf.transform` (axis=0) | Column (Series) | Column (Series) | Same as :meth:`apply` with (axis=0), but it raises an exception if the function changes the shape of the data | ++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ +| :ref:`udf.transform` (axis=1) | Row (Series) | Row (Series) | Same as :meth:`apply` with (axis=1), but it raises an exception if the function changes the shape of the data | ++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ + +When applying UDFs in pandas, it is essential to select the appropriate method based +on your specific task. Each method has its strengths and is designed for different use +cases. Understanding the purpose and behavior of each method will help you make informed +decisions, ensuring more efficient and maintainable code. + +.. note:: + Some of these methods can also be applied to groupby, resample, and various window objects. + See :ref:`groupby`, :ref:`resample()`, :ref:`rolling()`, :ref:`expanding()`, + and :ref:`ewm()` for details. + + +.. _udf.map: + +:meth:`Series.map` and :meth:`DataFrame.map` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The :meth:`map` method is used specifically to apply element-wise UDFs. This means the function +will be called for each element in the ``Series`` or ``DataFrame``, with the individual value or +the cell as the function argument. + +.. ipython:: python + + temperature_celsius = pd.DataFrame({ + "NYC": [14, 21, 23], + "Los Angeles": [22, 28, 31], + }) + + def to_fahrenheit(value): + return value * (9 / 5) + 32 + + temperature_celsius.map(to_fahrenheit) + +In this example, the function ``to_fahrenheit`` will be called 6 times, once for each value +in the ``DataFrame``, and the result of each call will be returned in the corresponding cell +of the resulting ``DataFrame``. + +In general, ``map`` will be slow, as it will not make use of vectorization. Instead, a Python +function call for each value will be required, which will slow things down significantly when +working with medium or large data. + +When to use: Use :meth:`map` for applying element-wise UDFs to DataFrames or Series. + +.. _udf.apply: + +:meth:`Series.apply` and :meth:`DataFrame.apply` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The :meth:`apply` method allows you to apply UDFs to a whole column or row. This is different +from :meth:`map` in that the function will be called for each column (or row), not for each individual value. + ..
ipython:: python + + temperature_celsius = pd.DataFrame({ + "NYC": [14, 21, 23], + "Los Angeles": [22, 28, 31], + }) + + def to_fahrenheit(column): + return column * (9 / 5) + 32 + + temperature_celsius.apply(to_fahrenheit) + +In the example, ``to_fahrenheit`` will be called only twice, as opposed to the 6 times with :meth:`map`. +This will be faster than using :meth:`map`, since the operations for each column are vectorized, and the +overhead of iterating over data in Python and calling Python functions is significantly reduced. + +In some cases, the function may require all the data to be able to compute the result, so :meth:`apply` +is needed, since with :meth:`map` the function can only access one element at a time. + +.. ipython:: python + + temperature = pd.DataFrame({ + "NYC": [14, 21, 23], + "Los Angeles": [22, 28, 31], + }) + + def normalize(column): + return column / column.mean() + + temperature.apply(normalize) + +In the example, the ``normalize`` function needs to compute the mean of the whole column in order +to divide each element by it. So we cannot call the function for each element; instead, we need the +function to receive the whole column. + +:meth:`apply` can also execute a function by row, by specifying ``axis=1``. + +.. ipython:: python + + temperature = pd.DataFrame({ + "NYC": [14, 21, 23], + "Los Angeles": [22, 28, 31], + }) + + def hotter(row): + return row["Los Angeles"] - row["NYC"] + + temperature.apply(hotter, axis=1) + +In the example, the function ``hotter`` will be called 3 times, once for each row, and each +call will receive the whole row as the argument, allowing computations that require more than +one value in the row. + +``apply`` is also available for :meth:`SeriesGroupBy.apply`, :meth:`DataFrameGroupBy.apply`, +:meth:`Rolling.apply`, :meth:`Expanding.apply` and :meth:`Resampler.apply`. You can read more +about ``apply`` in groupby operations in :ref:`groupby.apply`. + +When to use: :meth:`apply` is suitable when no alternative vectorized method or UDF method is available, +but consider optimizing performance with vectorized operations wherever possible. + +.. _udf.pipe: + +:meth:`Series.pipe` and :meth:`DataFrame.pipe` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``pipe`` method is similar to ``map`` and ``apply``, but the function receives the whole ``Series`` +or ``DataFrame`` it is called on. + +.. ipython:: python + + temperature = pd.DataFrame({ + "NYC": [14, 21, 23], + "Los Angeles": [22, 28, 31], + }) + + def normalize(df): + return df / df.mean().mean() + + temperature.pipe(normalize) + +This is equivalent to calling the ``normalize`` function with the ``DataFrame`` as the parameter. + +.. ipython:: python + + normalize(temperature) + +The main advantage of using ``pipe`` is readability. It allows method chaining and clearer code when +calling multiple functions. + +.. ipython:: python + + temperature_celsius = pd.DataFrame({ + "NYC": [14, 21, 23], + "Los Angeles": [22, 28, 31], + }) + + def multiply_by_9(value): + return value * 9 + + def divide_by_5(value): + return value / 5 + + def add_32(value): + return value + 32 + + # Without `pipe`: + fahrenheit = add_32(divide_by_5(multiply_by_9(temperature_celsius))) + + # With `pipe`: + fahrenheit = (temperature_celsius.pipe(multiply_by_9) + .pipe(divide_by_5) + .pipe(add_32)) + +``pipe`` is also available for :meth:`SeriesGroupBy.pipe`, :meth:`DataFrameGroupBy.pipe` and +:meth:`Resampler.pipe`. You can read more about ``pipe`` in groupby operations in :ref:`groupby.pipe`.
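+For example, a minimal sketch of ``pipe`` on a groupby object, assuming a hypothetical ``df``
+with ``group`` and ``value`` columns:
+
+.. code-block:: python
+
+    import pandas as pd
+
+    df = pd.DataFrame({"group": ["a", "a", "b"], "value": [1, 2, 3]})
+
+    def mean_plus_one(gb):
+        # gb is the whole DataFrameGroupBy object, not an individual group
+        return gb.mean() + 1
+
+    # Equivalent to mean_plus_one(df.groupby("group")), but chainable
+    df.groupby("group").pipe(mean_plus_one)
+
+This keeps grouped transformations chainable in the same way as the frame-level examples above.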
+ +When to use: Use :meth:`pipe` when you need to create a pipeline of operations and want to keep the code readable and maintainable. + +.. _udf.filter: + +:meth:`Series.filter` and :meth:`DataFrame.filter` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``filter`` method is used to select a subset of rows that match certain criteria. +:meth:`Series.filter` and :meth:`DataFrame.filter` do not support user defined functions, +but :meth:`SeriesGroupBy.filter` and :meth:`DataFrameGroupBy.filter` do. You can read more +about ``filter`` in groupby operations in :ref:`groupby.filter`. + +.. _udf.agg: + +:meth:`Series.agg` and :meth:`DataFrame.agg` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``agg`` method is used to aggregate a set of data points into a single one. +The most common aggregation functions such as ``min``, ``max``, ``mean``, ``sum``, etc. +are already implemented in pandas. ``agg`` allows you to implement other custom aggregate +functions. + +.. ipython:: python + + temperature = pd.DataFrame({ + "NYC": [14, 21, 23], + "Los Angeles": [22, 28, 31], + }) + + def highest_jump(column): + return column.pct_change().max() + + temperature.agg(highest_jump) + + +When to use: Use :meth:`agg` for performing custom aggregations, where the operation returns +a scalar value for each input. + +.. _udf.transform: + +:meth:`Series.transform` and :meth:`DataFrame.transform` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``transform`` method is similar to an aggregation, with the difference that the result is broadcast +to the original data. + +.. ipython:: python + + temperature = pd.DataFrame({ + "NYC": [14, 21, 23], + "Los Angeles": [22, 28, 31]}, + index=pd.date_range("2000-01-01", "2000-01-03")) + + def warm_up_all_days(column): + return pd.Series(column.max(), index=column.index) + + temperature.transform(warm_up_all_days) + +In the example, the ``warm_up_all_days`` function computes the ``max`` like an aggregation, but instead +of returning just the maximum value, it returns a ``DataFrame`` with the same shape as the original one, +with the values of each day replaced by the maximum temperature of the city. + +``transform`` is also available for :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.transform` and +:meth:`Resampler.transform`, where it's more common. You can read more about ``transform`` in groupby +operations in :ref:`groupby.transform`. + +When to use: Use :meth:`transform` when you need to perform an aggregation that will be returned in the original structure of +the DataFrame. + + +Performance +----------- + +While UDFs provide flexibility, their use is generally discouraged as they can introduce +performance issues, especially when written in pure Python. To improve efficiency, +consider using built-in ``NumPy`` or ``pandas`` functions instead of UDFs +for common operations. + +.. note:: + If performance is critical, explore **vectorized operations** before resorting + to UDFs. + +Vectorized Operations +~~~~~~~~~~~~~~~~~~~~~ + +Below is a comparison of using UDFs versus using vectorized operations: + +.. code-block:: python + + # User-defined function + def calc_ratio(row): + return 100 * (row["one"] / row["two"]) + + df["new_col"] = df.apply(calc_ratio, axis=1) + + # Vectorized Operation + df["new_col2"] = 100 * (df["one"] / df["two"]) + +Measuring how long each operation takes: + ..
code-block:: text + + User-defined function: 5.6435 secs + Vectorized: 0.0043 secs + +Vectorized operations in pandas are significantly faster than using :meth:`DataFrame.apply` +with UDFs because they leverage highly optimized C functions +via ``NumPy`` to process entire arrays at once. This approach avoids the overhead of looping +through rows in Python and making separate function calls for each row, which is slow and +inefficient. Additionally, ``NumPy`` arrays benefit from memory efficiency and CPU-level +optimizations, making vectorized operations the preferred choice whenever possible. + + +Improving Performance with UDFs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In scenarios where UDFs are necessary, there are still ways to mitigate their performance drawbacks. +One approach is to use **Numba**, a Just-In-Time (JIT) compiler that can significantly speed up numerical +Python code by compiling Python functions to optimized machine code at runtime. + +By annotating your UDFs with ``@numba.jit``, you can achieve performance closer to vectorized operations, +especially for computationally heavy tasks. + +.. note:: + You may also refer to the user guide on `Enhancing performance `_ + for a more detailed guide to using **Numba**. + +Using :meth:`DataFrame.pipe` for Composable Logic +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Another useful pattern for improving readability and composability, especially when mixing +vectorized logic with UDFs, is to use the :meth:`DataFrame.pipe` method. + +:meth:`DataFrame.pipe` doesn't improve performance directly, but it enables cleaner +method chaining by passing the entire object into a function. This is especially helpful +when chaining custom transformations: + +.. code-block:: python + + def add_ratio_column(df): + df["ratio"] = 100 * (df["one"] / df["two"]) + return df + + df = ( + df + .query("one > 0") + .pipe(add_ratio_column) + .dropna() + ) + +This is functionally equivalent to calling ``add_ratio_column(df)``, but keeps your code +clean and composable. The function you pass to :meth:`DataFrame.pipe` can use vectorized operations, +row-wise UDFs, or any other logic; :meth:`DataFrame.pipe` is agnostic. + +.. note:: + While :meth:`DataFrame.pipe` does not improve performance on its own, + it promotes clean, modular design and allows both vectorized and UDF-based logic + to be composed in method chains. diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 1dd6c5fabef04..9da73c8fd76d4 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -24,6 +24,7 @@ Version 2.3 .. toctree:: :maxdepth: 2 + v2.3.1 v2.3.0 Version 2.2 diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst index dcb0d3229aa5d..903632b488cca 100644 --- a/doc/source/whatsnew/v0.11.0.rst +++ b/doc/source/whatsnew/v0.11.0.rst @@ -70,14 +70,14 @@ See the section :ref:`Selection by Position ` for substitutes. Dtypes ~~~~~~ -Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste. +Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed ``Series``), then it will be preserved in DataFrame operations. 
Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste. .. ipython:: python - df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32') + df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float64') df1 df1.dtypes - df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float16'), + df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float32'), 'B': pd.Series(np.random.randn(8)), 'C': pd.Series(range(8), dtype='uint8')}) df2 diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst index 08d3a6b188322..f2674938e7726 100644 --- a/doc/source/whatsnew/v0.12.0.rst +++ b/doc/source/whatsnew/v0.12.0.rst @@ -245,7 +245,7 @@ IO enhancements format. (:issue:`3571`, :issue:`1651`, :issue:`3141`) - If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it - with ``df.to_csv(..., index=False``), then any ``names`` on the columns index will + with ``df.to_csv(..., index=False)``), then any ``names`` on the columns index will be *lost*. .. ipython:: python diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst index cbf5b7703bd79..b376530358f53 100644 --- a/doc/source/whatsnew/v0.16.1.rst +++ b/doc/source/whatsnew/v0.16.1.rst @@ -353,7 +353,7 @@ Deprecations Index representation ~~~~~~~~~~~~~~~~~~~~ -The string representation of ``Index`` and its sub-classes have now been unified. These will show a single-line display if there are few values; a wrapped multi-line display for a lot of values (but less than ``display.max_seq_items``; if lots of items (> ``display.max_seq_items``) will show a truncated display (the head and tail of the data). The formatting for ``MultiIndex`` is unchanged (a multi-line wrapped display). The display width responds to the option ``display.max_seq_items``, which is defaulted to 100. (:issue:`6482`) +The string representation of ``Index`` and its sub-classes have now been unified. These will show a single-line display if there are few values; a wrapped multi-line display for a lot of values (but less than ``display.max_seq_items``); if lots of items (> ``display.max_seq_items``) will show a truncated display (the head and tail of the data). The formatting for ``MultiIndex`` is unchanged (a multi-line wrapped display). The display width responds to the option ``display.max_seq_items``, which is defaulted to 100. 
(:issue:`6482`) Previous behavior diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 1ae711113773f..0b1f6a2249a6c 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -1547,7 +1547,7 @@ Bug fixes - Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`) - Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) - Bug ``Series.isnull()`` and ``Series.notnull()`` ignore ``Period('NaT')`` (:issue:`13737`) -- Bug ``Series.fillna()`` and ``Series.dropna()`` don't affect to ``Period('NaT')`` (:issue:`13737` +- Bug ``Series.fillna()`` and ``Series.dropna()`` don't affect to ``Period('NaT')`` (:issue:`13737`) - Bug in ``.fillna(value=np.nan)`` incorrectly raises ``KeyError`` on a ``category`` dtyped ``Series`` (:issue:`14021`) - Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`) - Bug in ``.resample(..)`` where incorrect warnings were triggered by IPython introspection (:issue:`13618`) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 60e77a8c5d8c5..0f40f5bfa5fc9 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1019,7 +1019,7 @@ operations has been changed to match the arithmetic operations in these cases. The affected cases are: - operating against a 2-dimensional ``np.ndarray`` with either 1 row or 1 column will now broadcast the same way a ``np.ndarray`` would (:issue:`23000`). -- a list or tuple with length matching the number of rows in the :class:`DataFrame` will now raise ``ValueError`` instead of operating column-by-column (:issue:`22880`. +- a list or tuple with length matching the number of rows in the :class:`DataFrame` will now raise ``ValueError`` instead of operating column-by-column (:issue:`22880`). - a list or tuple with length matching the number of columns in the :class:`DataFrame` will now operate row-by-row instead of raising ``ValueError`` (:issue:`22880`). .. ipython:: python @@ -1556,7 +1556,7 @@ Performance improvements (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains` is likewise much faster (:issue:`21369`, :issue:`21508`) - Improved performance of :meth:`HDFStore.groups` (and dependent functions like - :meth:`HDFStore.keys`. (i.e. ``x in store`` checks are much faster) + :meth:`HDFStore.keys` (i.e. 
``x in store`` checks) are much faster) (:issue:`21372`) - Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`) - Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 98cb9c4ad7b45..1aac68b90ff2f 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1114,7 +1114,7 @@ Numeric - Bug in :class:`UInt64Index` precision loss while constructing from a list with values in the ``np.uint64`` range (:issue:`29526`) - Bug in :class:`NumericIndex` construction that caused indexing to fail when integers in the ``np.uint64`` range were used (:issue:`28023`) - Bug in :class:`NumericIndex` construction that caused :class:`UInt64Index` to be casted to :class:`Float64Index` when integers in the ``np.uint64`` range were used to index a :class:`DataFrame` (:issue:`28279`) -- Bug in :meth:`Series.interpolate` when using method=`index` with an unsorted index, would previously return incorrect results. (:issue:`21037`) +- Bug in :meth:`Series.interpolate` when using ``method='index'`` with an unsorted index, would previously return incorrect results. (:issue:`21037`) - Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`) - Bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` when there are duplicated indices (:issue:`30463`) - Bug in :class:`DataFrame` cumulative operations (e.g. cumsum, cummax) incorrect casting to object-dtype (:issue:`19296`) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b199b113d26f2..dff73bef79135 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1039,7 +1039,7 @@ Missing ^^^^^^^ - Calling :meth:`fillna` on an empty :class:`Series` now correctly returns a shallow copied object. The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`). - Bug in :meth:`Series.replace` when argument ``to_replace`` is of type dict/list and is used on a :class:`Series` containing ```` was raising a ``TypeError``. The method now handles this by ignoring ```` values when doing the comparison for the replacement (:issue:`32621`) -- Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ```` for all ``False`` or all ``True`` values using the nulllable Boolean dtype and with ``skipna=False`` (:issue:`33253`) +- Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ```` for all ``False`` or all ``True`` values using the nullable Boolean dtype and with ``skipna=False`` (:issue:`33253`) - Clarified documentation on interpolate with ``method=akima``. The ``der`` parameter must be scalar or ``None`` (:issue:`33426`) - :meth:`DataFrame.interpolate` uses the correct axis convention now. Previously interpolating along columns lead to interpolation along indices and vice versa. Furthermore interpolating with methods ``pad``, ``ffill``, ``bfill`` and ``backfill`` are identical to using these methods with :meth:`DataFrame.fillna` (:issue:`12918`, :issue:`29146`) - Bug in :meth:`DataFrame.interpolate` when called on a :class:`DataFrame` with column names of string type was throwing a ValueError. 
The method is now independent of the type of the column names (:issue:`33956`) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 12ab4f27d1e62..ebde7cb14684b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -793,7 +793,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` that would throw a ``ValueError`` when resampling from ``"D"`` to ``"24H"`` over a transition into daylight savings time (DST) (:issue:`35219`) - Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising a ``TypeError`` (:issue:`35325`) - Bug in :meth:`.DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply`` (:issue:`34656`) -- Bug when subsetting columns on a :class:`.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values (:issue:`9959`) +- Bug when subsetting columns on a :class:`.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']]``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values (:issue:`9959`) - Bug in :meth:`.DataFrameGroupBy.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`) - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`) - Bug in :meth:`.DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 7b1aef07e5f00..cf016c882c225 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -666,7 +666,7 @@ be removed in a future version. Use :func:`pandas.concat` instead (:issue:`35407 .. code-block:: ipython - In [1]: pd.Series([1, 2]).append(pd.Series([3, 4]) + In [1]: pd.Series([1, 2]).append(pd.Series([3, 4])) Out [1]: :1: FutureWarning: The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. 0 1 diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 43aa63c284f38..0bede60758331 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -287,7 +287,7 @@ and attributes without holding entire tree in memory (:issue:`45442`). In [1]: df = pd.read_xml( ... "/path/to/downloaded/enwikisource-latest-pages-articles.xml", - ... iterparse = {"page": ["title", "ns", "id"]}) + ... iterparse = {"page": ["title", "ns", "id"]} ... ) df Out[2]: diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 329ef2859f56f..e32417e367427 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -815,8 +815,8 @@ Conversion ^^^^^^^^^^ - Bug in :meth:`DataFrame.astype` when called with ``str`` on unpickled array - the array might change in-place (:issue:`54654`) - Bug in :meth:`DataFrame.astype` where ``errors="ignore"`` had no effect for extension types (:issue:`54654`) +- Bug in :meth:`DataFrame.loc` was not throwing "incompatible dtype warning" (see `PDEP6 `_) when assigning a ``Series`` with a different dtype using a full column setter (e.g. 
``df.loc[:, 'a'] = incompatible_value``) (:issue:`39584`) - Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`) -- Bug in :meth:``DataFrame.loc`` was not throwing "incompatible dtype warning" (see `PDEP6 `_) when assigning a ``Series`` with a different dtype using a full column setter (e.g. ``df.loc[:, 'a'] = incompatible_value``) (:issue:`39584`) Strings ^^^^^^^ @@ -826,7 +826,7 @@ Strings - Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`) - Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`) - Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`) -- Bug in :meth:`Series.str.fullmatch` when ``dtype=pandas.ArrowDtype(pyarrow.string()))`` allows partial matches when regex ends in literal //$ (:issue:`56652`) +- Bug in :meth:`Series.str.fullmatch` when ``dtype=pandas.ArrowDtype(pyarrow.string())`` allows partial matches when regex ends in literal //$ (:issue:`56652`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 230332319e0ac..6433fe8d2b060 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -1,6 +1,6 @@ .. _whatsnew_230: -What's new in 2.3.0 (Month XX, 2024) +What's new in 2.3.0 (June 4, 2025) ------------------------------------ These are the changes in pandas 2.3.0. See :ref:`release` for a full changelog @@ -10,37 +10,26 @@ including other versions of pandas. .. --------------------------------------------------------------------------- -.. _whatsnew_230.upcoming_changes: - -Upcoming changes in pandas 3.0 -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - .. _whatsnew_230.enhancements: Enhancements ~~~~~~~~~~~~ -.. _whatsnew_230.enhancements.enhancement1: - -enhancement1 -^^^^^^^^^^^^ - - .. _whatsnew_230.enhancements.other: Other enhancements ^^^^^^^^^^^^^^^^^^ +- :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. 
called when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been updated to work correctly with NumPy >= 2 (:issue:`57739`) -- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`) -- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) +- :meth:`Series.str.decode` result now has :class:`StringDtype` when ``future.infer_string`` is True (:issue:`60709`) +- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with :class:`StringDtype` (:issue:`60663`) - Improved ``repr`` of :class:`.NumpyExtensionArray` to account for NEP51 (:issue:`61085`) - The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`) -- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`) -- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) +- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for :class:`StringDtype` columns (:issue:`60633`) +- The :meth:`~Series.sum` reduction is now implemented for :class:`StringDtype` columns (:issue:`59853`) .. --------------------------------------------------------------------------- .. _whatsnew_230.notable_bug_fixes: @@ -50,19 +39,29 @@ Notable bug fixes These are bug fixes that might have notable behavior changes. -.. _whatsnew_230.notable_bug_fixes.notable_bug_fix1: +.. _whatsnew_230.notable_bug_fixes.string_comparisons: + +Comparisons between different string dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -notable_bug_fix1 -^^^^^^^^^^^^^^^^ +In previous versions, comparing :class:`Series` of different string dtypes (e.g. ``pd.StringDtype("pyarrow", na_value=pd.NA)`` against ``pd.StringDtype("python", na_value=np.nan)``) would result in inconsistent resulting dtype or incorrectly raise. pandas will now use the hierarchy + + object < (python, NaN) < (pyarrow, NaN) < (python, NA) < (pyarrow, NA) + +in determining the result dtype when there are different string dtypes compared. Some examples: + +- When ``pd.StringDtype("pyarrow", na_value=pd.NA)`` is compared against any other string dtype, the result will always be ``boolean[pyarrow]``. +- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("pyarrow", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array. +- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("python", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array. .. _whatsnew_230.api_changes: API changes ~~~~~~~~~~~ -- When enabling the ``future.infer_string`` option: Index set operations (like - union or intersection) will now ignore the dtype of an empty ``RangeIndex`` or - empty ``Index`` with object dtype when determining the dtype of the resulting +- When enabling the ``future.infer_string`` option, :class:`Index` set operations (like + union or intersection) will now ignore the dtype of an empty :class:`RangeIndex` or + empty :class:`Index` with ``object`` dtype when determining the dtype of the resulting Index (:issue:`60797`) .. 
--------------------------------------------------------------------------- @@ -73,119 +72,35 @@ Deprecations - Deprecated allowing non-``bool`` values for ``na`` in :meth:`.str.contains`, :meth:`.str.startswith`, and :meth:`.str.endswith` for dtypes that do not already disallow these (:issue:`59615`) - Deprecated the ``"pyarrow_numpy"`` storage option for :class:`StringDtype` (:issue:`60152`) -.. --------------------------------------------------------------------------- -.. _whatsnew_230.performance: - -Performance improvements -~~~~~~~~~~~~~~~~~~~~~~~~ -- -- - .. --------------------------------------------------------------------------- .. _whatsnew_230.bug_fixes: Bug fixes ~~~~~~~~~ -Categorical -^^^^^^^^^^^ -- -- - -Datetimelike -^^^^^^^^^^^^ -- -- - -Timedelta -^^^^^^^^^ -- -- - -Timezones -^^^^^^^^^ -- -- - Numeric ^^^^^^^ -- Enabled :class:`Series.mode` and :class:`DataFrame.mode` with ``dropna=False`` to sort the result for all dtypes in the presence of NA values; previously only certain dtypes would sort (:issue:`60702`) -- - -Conversion -^^^^^^^^^^ -- -- +- Bug in :meth:`Series.mode` and :meth:`DataFrame.mode` with ``dropna=False`` where not all dtypes would sort in the presence of ``NA`` values (:issue:`60702`) +- Bug in :meth:`Series.round` where a ``TypeError`` would always raise with ``object`` dtype (:issue:`61206`) Strings ^^^^^^^ -- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` on :class:`StringDtype` with all NA values resulted in ``0`` and is now the empty string ``""`` (:issue:`60229`) -- Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`) -- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`) +- Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max` where all NA values of string dtype would return float instead of string dtype (:issue:`60810`) +- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` with all NA values of :class:`StringDtype` resulted in ``0`` instead of the empty string ``""`` (:issue:`60229`) +- Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` where an ``Exception`` was not raised for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`) +- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` that incorrectly returned integer results with ``method="average"`` and raised an error if it would truncate results (:issue:`59768`) - Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`) +- Bug in :meth:`Series.str.center` with :class:`StringDtype` with ``storage="pyarrow"`` not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) -- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) -- Bug in the ``center`` method on 
:class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) - -Interval -^^^^^^^^ -- -- +- Bug in :meth:`Series.str.slice` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) Indexing ^^^^^^^^ -- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`) -- - -Missing -^^^^^^^ -- -- - -MultiIndex -^^^^^^^^^^ -- -- +- Bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`) I/O ^^^ -- :meth:`DataFrame.to_excel` was storing decimals as strings instead of numbers (:issue:`49598`) -- - -Period -^^^^^^ -- -- - -Plotting -^^^^^^^^ -- -- - -Groupby/resample/rolling -^^^^^^^^^^^^^^^^^^^^^^^^ -- -- - -Reshaping -^^^^^^^^^ -- -- - -Sparse -^^^^^^ -- -- - -ExtensionArray -^^^^^^^^^^^^^^ -- -- - -Styler -^^^^^^ -- -- +- Bug in :meth:`DataFrame.to_excel` which stored decimals as strings instead of numbers (:issue:`49598`) Other ^^^^^ @@ -197,3 +112,5 @@ Other Contributors ~~~~~~~~~~~~ + +.. contributors:: v2.2.3..v2.3.0|HEAD diff --git a/doc/source/whatsnew/v2.3.1.rst b/doc/source/whatsnew/v2.3.1.rst new file mode 100644 index 0000000000000..c9d8f04250c23 --- /dev/null +++ b/doc/source/whatsnew/v2.3.1.rst @@ -0,0 +1,42 @@ +.. _whatsnew_231: + +What's new in 2.3.1 (Month XX, 2025) +------------------------------------ + +These are the changes in pandas 2.3.1. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_231.enhancements: + +Enhancements +~~~~~~~~~~~~ +- + +.. _whatsnew_231.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_231.bug_fixes: + +Bug fixes +~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_231.other: + +Other +~~~~~ +- + +.. --------------------------------------------------------------------------- +.. 
_whatsnew_231.contributors: + +Contributors +~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 2b437734a451a..8d3ac0e396430 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -30,7 +30,6 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) -- :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) - Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`) - Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`) - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` @@ -52,6 +51,7 @@ Other enhancements - :class:`Rolling` and :class:`Expanding` now support ``pipe`` method (:issue:`57076`) - :class:`Series` now supports the Arrow PyCapsule Interface for export (:issue:`59518`) - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header cells (:issue:`35384`) +- :func:`set_option` now accepts a dictionary of options, simplifying configuration of multiple settings at once (:issue:`61093`) - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as an optional argument, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) - :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`) @@ -61,21 +61,28 @@ Other enhancements - :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) +- :meth:`Series.nlargest` uses a 'stable' sort internally and will preserve original ordering.
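A minimal doctest-style illustration of the stable ``nlargest`` ordering noted in the entry above; the values and index labels are invented for the example, and tied values come back in their original order:

>>> import pandas as pd
>>> s = pd.Series([3, 1, 3, 2, 3], index=["a", "b", "c", "d", "e"])
>>> s.nlargest(2)  # the tied 3's at "a" and "c" keep their original order
a    3
c    3
dtype: int64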
- :class:`ArrowDtype` now supports ``pyarrow.JsonType`` (:issue:`60958`) - :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`) +- :class:`Easter` has gained a new constructor argument ``method`` which specifies the method used to calculate Easter — for example, Orthodox Easter (:issue:`61665`) +- :class:`Holiday` constructor argument ``days_of_week`` will raise a ``ValueError`` when type is something other than ``None`` or ``tuple`` (:issue:`61658`) +- :class:`Holiday` has gained the constructor argument and field ``exclude_dates`` to exclude specific datetimes from a custom holiday calendar (:issue:`54382`) - :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`) - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`) +- :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) +- :meth:`Series.map` now accepts an ``engine`` parameter to allow execution with a third-party execution engine (:issue:`61125`) - :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`) - Add ``"delete_rows"`` option to ``if_exists`` argument in :meth:`DataFrame.to_sql` deleting all records of the table before inserting data (:issue:`37210`). 
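A brief, hedged sketch of the new ``Easter(method=...)`` keyword described in the :class:`Easter` entry above; it assumes this changeset is installed and relies on ``dateutil``'s existing Easter-method constants:

>>> import pandas as pd
>>> from dateutil.easter import EASTER_ORTHODOX
>>> from pandas.tseries.offsets import Easter
>>> ts = pd.Timestamp("2022-01-01")
>>> ts + Easter()  # default Western (Gregorian) calculation
Timestamp('2022-04-17 00:00:00')
>>> ts + Easter(method=EASTER_ORTHODOX)  # Orthodox calculation
Timestamp('2022-04-24 00:00:00')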
- Added half-year offset classes :class:`HalfYearBegin`, :class:`HalfYearEnd`, :class:`BHalfYearBegin` and :class:`BHalfYearEnd` (:issue:`60928`) +- Added support to read and write from and to Apache Iceberg tables with the new :func:`read_iceberg` and :meth:`DataFrame.to_iceberg` functions (:issue:`61383`) - Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`) - Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`) - Improved deprecation message for offset aliases (:issue:`60820`) @@ -313,12 +320,40 @@ Optional libraries below the lowest tested version may still work, but are not c +========================+=====================+ | pytz | 2023.4 | +------------------------+---------------------+ -| fastparquet | 2023.10.0 | +| fastparquet | 2024.2.0 | +------------------------+---------------------+ | adbc-driver-postgresql | 0.10.0 | +------------------------+---------------------+ | mypy (dev) | 1.9.0 | +------------------------+---------------------+ +| beautifulsoup4 | 4.12.3 | ++------------------------+---------------------+ +| fsspec | 2024.2.0 | ++------------------------+---------------------+ +| gcsfs | 2024.2.0 | ++------------------------+---------------------+ +| s3fs | 2024.2.0 | ++------------------------+---------------------+ +| Jinja2 | 3.1.3 | ++------------------------+---------------------+ +| matplotlib | 3.8.3 | ++------------------------+---------------------+ +| numba | 0.59.0 | ++------------------------+---------------------+ +| numexpr | 2.9.0 | ++------------------------+---------------------+ +| pymysql | 1.1.0 | ++------------------------+---------------------+ +| pyreadstat | 1.2.6 | ++------------------------+---------------------+ +| SciPy | 1.12.0 | ++------------------------+---------------------+ +| xarray | 2024.1.0 | ++------------------------+---------------------+ +| xlsxwriter | 3.2.0 | ++------------------------+---------------------+ +| zstandard | 0.22.0 | ++------------------------+---------------------+ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. @@ -420,6 +455,7 @@ Other Deprecations - Deprecated lowercase strings ``w``, ``w-mon``, ``w-tue``, etc. denoting frequencies in :class:`Week` in favour of ``W``, ``W-MON``, ``W-TUE``, etc. (:issue:`58998`) - Deprecated parameter ``method`` in :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` (:issue:`58667`) - Deprecated strings ``w``, ``d``, ``MIN``, ``MS``, ``US`` and ``NS`` denoting units in :class:`Timedelta` in favour of ``W``, ``D``, ``min``, ``ms``, ``us`` and ``ns`` (:issue:`59051`) +- Deprecated the ``arg`` parameter of ``Series.map``; pass the added ``func`` argument instead. (:issue:`61260`) - Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`) .. 
--------------------------------------------------------------------------- @@ -591,6 +627,7 @@ Performance improvements - :func:`concat` returns a :class:`RangeIndex` column when possible when ``objs`` contains :class:`Series` and :class:`DataFrame` and ``axis=0`` (:issue:`58119`) - :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`) - :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`) +- :meth:`Series.nlargest` has improved performance when there are duplicate values in the index (:issue:`55767`) - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`) - :meth:`Series.str.partition` with :class:`ArrowDtype` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57768`) - Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`) @@ -621,6 +658,7 @@ Performance improvements - Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`) - Performance improvement in :meth:`DataFrame.__getitem__` when ``key`` is a :class:`DataFrame` with many columns (:issue:`61010`) - Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`) +- Performance improvement in :meth:`DataFrame.stack` when using ``future_stack=True`` and the DataFrame does not have a :class:`MultiIndex` (:issue:`58391`) - Performance improvement in :meth:`DataFrame.where` when ``cond`` is a :class:`DataFrame` with many columns (:issue:`61010`) - Performance improvement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) @@ -636,6 +674,7 @@ Bug fixes Categorical ^^^^^^^^^^^ - Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`) +- Bug in :meth:`DataFrame.pivot` and :meth:`DataFrame.set_index` raising an ``ArrowNotImplementedError`` for columns with pyarrow dictionary dtype (:issue:`53051`) - Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where empty :class:`CategoricalDtype` :class:`Series` raised an error or got converted to ``null[pyarrow]`` (:issue:`59934`) - @@ -648,6 +687,7 @@ Datetimelike - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`) - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) - Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`) +- Bug in :meth:`DataFrame.fillna` raising an ``AssertionError`` instead of ``OutOfBoundsDatetime`` when filling a ``datetime64[ns]`` column with an out-of-bounds timestamp. Now correctly raises ``OutOfBoundsDatetime``. 
(:issue:`61208`) - Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` casting ``datetime64`` and ``timedelta64`` columns to ``float64`` and losing precision (:issue:`60850`) - Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business days frequencies bigger then "1C" (:issue:`58664`) @@ -658,6 +698,7 @@ Datetimelike - Bug in :meth:`to_datetime` on float array with missing values throwing ``FloatingPointError`` (:issue:`58419`) - Bug in :meth:`to_datetime` on float32 df with year, month, day etc. columns leads to precision issues and incorrect result. (:issue:`60506`) - Bug in :meth:`to_datetime` reports incorrect index in case of any failure scenario. (:issue:`58298`) +- Bug in :meth:`to_datetime` with ``format="ISO8601"`` and ``utc=True`` where naive timestamps incorrectly inherited timezone offset from previous timestamps in a series. (:issue:`61389`) - Bug in :meth:`to_datetime` wrongly converts when ``arg`` is a ``np.datetime64`` object with unit of ``ps``. (:issue:`60341`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) @@ -668,13 +709,16 @@ Timedelta Timezones ^^^^^^^^^ -- +- Bug in :meth:`DatetimeIndex.union`, :meth:`DatetimeIndex.intersection`, and :meth:`DatetimeIndex.symmetric_difference` changing timezone to UTC when merging two DatetimeIndex objects with the same timezone but different units (:issue:`60080`) - Numeric ^^^^^^^ - Bug in :meth:`DataFrame.corr` where numerical precision errors resulted in correlations above ``1.0`` (:issue:`61120`) +- Bug in :meth:`DataFrame.cov` raises a ``TypeError`` instead of returning potentially incorrect results or other errors (:issue:`53115`) - Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`) +- Bug in :meth:`Series.dot` returning ``object`` dtype for :class:`ArrowDtype` and nullable-dtype data (:issue:`61375`) +- Bug in :meth:`Series.std` and :meth:`Series.var` when using complex-valued data (:issue:`61645`) - Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`) Conversion @@ -732,10 +776,12 @@ I/O - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) - Bug in :meth:`DataFrame.to_excel` where the :class:`MultiIndex` index with a period level was not a date (:issue:`60099`) +- Bug in :meth:`DataFrame.to_stata` when exporting a column containing both long strings (Stata strL) and :class:`pd.NA` values (:issue:`23633`) - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`) - Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. 
(:issue:`16098`) - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) +- Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`) - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) @@ -749,6 +795,7 @@ I/O - Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`) - Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`) - Bug in :meth:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` has no effect (:issue:`59884`) +- Bug in :meth:`to_csv` where ``quotechar``` is not escaped when ``escapechar`` is not None (:issue:`61407`) - Bug in :meth:`to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` is passed (:issue:`60274`) Period @@ -759,22 +806,30 @@ Period Plotting ^^^^^^^^ - Bug in :meth:`.DataFrameGroupBy.boxplot` failed when there were multiple groupings (:issue:`14701`) +- Bug in :meth:`DataFrame.plot.bar` when ``subplots`` and ``stacked=True`` are used in conjunction which causes incorrect stacking. (:issue:`61018`) - Bug in :meth:`DataFrame.plot.bar` with ``stacked=True`` where labels on stacked bars with zero-height segments were incorrectly positioned at the base instead of the label position of the previous segment (:issue:`59429`) - Bug in :meth:`DataFrame.plot.line` raising ``ValueError`` when set both color and a ``dict`` style (:issue:`59461`) - Bug in :meth:`DataFrame.plot` that causes a shift to the right when the frequency multiplier is greater than one. (:issue:`57587`) +- Bug in :meth:`DataFrame.plot` where ``title`` would require extra titles when plotting more than one column per subplot. (:issue:`61019`) +- Bug in :meth:`Series.plot` preventing a line and bar from being aligned on the same plot (:issue:`61161`) +- Bug in :meth:`Series.plot` preventing a line and scatter plot from being aligned (:issue:`61005`) - Bug in :meth:`Series.plot` with ``kind="pie"`` with :class:`ArrowDtype` (:issue:`59192`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`.DataFrameGroupBy.__len__` and :meth:`.SeriesGroupBy.__len__` would raise when the grouping contained NA values and ``dropna=False`` (:issue:`58644`) - Bug in :meth:`.DataFrameGroupBy.any` that returned True for groups where all Timedelta values are NaT. (:issue:`59712`) +- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` would fail when the groups were :class:`Categorical` with an NA value (:issue:`61356`) - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`) - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. 
(:issue:`57926`) - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`) +- Bug in :meth:`.Series.rolling` when used with a :class:`.BaseIndexer` subclass and computing min/max (:issue:`46726`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) +- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` were not keeping the index name when the index had :class:`ArrowDtype` timestamp dtype (:issue:`61222`) - Bug in :meth:`DataFrame.resample` changing index type to :class:`MultiIndex` when the dataframe is empty and using an upsample method (:issue:`55572`) - Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`) +- Bug in :meth:`DataFrameGroupBy.agg` where applying a user-defined function to an empty DataFrame returned a Series instead of an empty DataFrame. (:issue:`61503`) - Bug in :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` for empty data frame with ``group_keys=False`` still creating output index using group keys. (:issue:`60471`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) - Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`) @@ -783,7 +838,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`) - Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`) - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`) -- Bug in :meth:`Series.resample` could raise when the the date range ended shortly before a non-existent time. (:issue:`58380`) +- Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. 
(:issue:`58380`) Reshaping ^^^^^^^^^ @@ -796,9 +851,12 @@ Reshaping - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) - Bug in :meth:`DataFrame.merge` when merging two :class:`DataFrame` on ``intc`` or ``uintc`` types on Windows (:issue:`60091`, :issue:`58713`) - Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`) +- Bug in :meth:`DataFrame.pivot_table` incorrectly ignoring the ``values`` argument when also supplied to the ``index`` or ``columns`` parameters (:issue:`57876`, :issue:`61292`) - Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) - Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`) +- Bug in :func:`melt` where calling with duplicate column names in ``id_vars`` raised a misleading ``AttributeError`` (:issue:`61475`) +- Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`) Sparse ^^^^^^ @@ -825,17 +883,21 @@ Other - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) - Bug in :class:`Series` ignoring errors when trying to convert :class:`Series` input data to the given ``dtype`` (:issue:`60728`) - Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`) +- Bug in :func:`eval` where method calls on binary operations like ``(x + y).dropna()`` would raise ``AttributeError: 'BinOp' object has no attribute 'value'`` (:issue:`61175`) - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`) - Bug in :func:`eval` with ``engine="numexpr"`` returning unexpected result for float division. (:issue:`59736`) - Bug in :func:`to_numeric` raising ``TypeError`` when ``arg`` is a :class:`Timedelta` or :class:`Timestamp` scalar. (:issue:`59944`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) +- Bug in :meth:`DataFrame.apply` raising ``RecursionError`` when passing ``func=list[int]``. (:issue:`61565`) - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow to use ``tan`` function. (:issue:`55091`) - Bug in :meth:`DataFrame.query` where using duplicate column names led to a ``TypeError``. (:issue:`59950`) - Bug in :meth:`DataFrame.query` which raised an exception or produced incorrect results when expressions contained backtick-quoted column names containing the hash character ``#``, backticks, or characters that fall outside the ASCII range (U+0001..U+007F). 
(:issue:`59285`) (:issue:`49633`) +- Bug in :meth:`DataFrame.query` which raised an exception when querying integer column names using backticks. (:issue:`60494`) - Bug in :meth:`DataFrame.shift` where passing a ``freq`` on a DataFrame with no columns did not shift the index correctly. (:issue:`60102`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) +- Bug in :meth:`DataFrame.sort_values` where sorting by a column explicitly named ``None`` raised a ``KeyError`` instead of sorting by the column as expected. (:issue:`61512`) - Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) @@ -855,6 +917,7 @@ Other - Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`) - Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`) - Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`) +- Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`) - Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`) .. 
***DO NOT USE THIS SECTION*** diff --git a/environment.yml b/environment.yml index a8c8b20e20fe4..74186bd2581c4 100644 --- a/environment.yml +++ b/environment.yml @@ -3,12 +3,12 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.10 + - python=3.11 - pip # build dependencies - versioneer - - cython~=3.0.5 + - cython<4.0.0a0 - meson=1.2.1 - meson-python=0.13.1 @@ -23,43 +23,42 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy<3 # optional dependencies - - beautifulsoup4>=4.11.2 - - blosc + - beautifulsoup4>=4.12.3 - bottleneck>=1.3.6 - - fastparquet>=2023.10.0 - - fsspec>=2022.11.0 + - fastparquet>=2024.2.0 + - fsspec>=2023.12.2 - html5lib>=1.1 - hypothesis>=6.84.0 - - gcsfs>=2022.11.0 + - gcsfs>=2023.12.2 - ipython - pickleshare # Needed for IPython Sphinx directive in the docs GH#60429 - - jinja2>=3.1.2 + - jinja2>=3.1.3 - lxml>=4.9.2 - - matplotlib>=3.6.3 - - numba>=0.56.4 - - numexpr>=2.8.4 - - openpyxl>=3.1.0 + - matplotlib>=3.8.3 + - numba>=0.59.0 + - numexpr>=2.9.0 + - openpyxl>=3.1.2 - odfpy>=1.4.1 - - py - psycopg2>=2.9.6 - pyarrow>=10.0.1 - - pymysql>=1.0.2 - - pyreadstat>=1.2.0 + - pyiceberg>=0.7.1 + - pymysql>=1.1.0 + - pyreadstat>=1.2.6 - pytables>=3.8.0 - python-calamine>=0.1.7 - pytz>=2023.4 - pyxlsb>=1.0.10 - - s3fs>=2022.11.0 - - scipy>=1.10.0 + - s3fs>=2023.12.2 + - scipy>=1.12.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0, <=2024.9.0 + - xarray>=2024.1.1 - xlrd>=2.0.1 - - xlsxwriter>=3.0.5 - - zstandard>=0.19.0 + - xlsxwriter>=3.2.0 + - zstandard>=0.22.0 # downstream packages - dask-core @@ -80,12 +79,10 @@ dependencies: - flake8=7.1.0 # run in subprocess over docstring examples - mypy=1.13.0 # pre-commit uses locally installed mypy - tokenize-rt # scripts/check_for_inconsistent_pandas_namespace.py - - pre-commit>=4.0.1 + - pre-commit>=4.2.0 # documentation - gitpython # obtain contributors from git for whatsnew - - gitdb - - google-auth - natsort # DataFrame.sort_values doctest - numpydoc - pydata-sphinx-theme=0.16 diff --git a/meson.build b/meson.build index 66583095a6e77..6a00e52481108 100644 --- a/meson.build +++ b/meson.build @@ -47,6 +47,28 @@ endif cy = meson.get_compiler('cython') if cy.version().version_compare('>=3.1.0') add_project_arguments('-Xfreethreading_compatible=true', language: 'cython') + + # Use shared utility code to reduce wheel sizes + # copied from https://github.com/scikit-learn/scikit-learn/pull/31151/files + cy = find_program(cy.cmd_array()[0]) + cython_shared_src = custom_target( + install: false, + output: '_cyutility.c', + command: [ + cy, + '-3', + '-Xfreethreading_compatible=true', + '--fast-fail', + '--generate-shared=' + meson.current_build_dir() / '_cyutility.c', + ], + ) + + py.extension_module( + '_cyutility', + cython_shared_src, + subdir: 'pandas/_libs', + install: true, + ) endif # Needed by pandas.test() when it looks for the pytest ini options diff --git a/pandas/__init__.py b/pandas/__init__.py index c570fb8d70204..8b92ad6cdfebb 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -3,20 +3,18 @@ __docformat__ = "restructuredtext" # Let users know if they're missing any of our hard dependencies -_hard_dependencies = ("numpy", "dateutil") -_missing_dependencies = [] +_hard_dependencies = ("numpy", "dateutil", "tzdata") for _dependency in _hard_dependencies: try: __import__(_dependency) except ImportError as _e: # pragma: no cover - _missing_dependencies.append(f"{_dependency}: {_e}") + raise ImportError( + f"Unable to import required dependency {_dependency}. 
" + "Please see the traceback for details." + ) from _e -if _missing_dependencies: # pragma: no cover - raise ImportError( - "Unable to import required dependencies:\n" + "\n".join(_missing_dependencies) - ) -del _hard_dependencies, _dependency, _missing_dependencies +del _hard_dependencies, _dependency try: # numpy compat @@ -166,6 +164,7 @@ read_stata, read_sas, read_spss, + read_iceberg, ) from pandas.io.json._normalize import json_normalize @@ -321,6 +320,7 @@ "read_fwf", "read_hdf", "read_html", + "read_iceberg", "read_json", "read_orc", "read_parquet", diff --git a/pandas/_config/config.py b/pandas/_config/config.py index ce53e05608ba7..d42d90d44f82f 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -199,9 +199,9 @@ def set_option(*args) -> None: Parameters ---------- - *args : str | object - Arguments provided in pairs, which will be interpreted as (pattern, value) - pairs. + *args : str | object | dict + Arguments provided in pairs, which will be interpreted as (pattern, value), + or as a single dictionary containing multiple option-value pairs. pattern: str Regexp which should match a single option value: object @@ -239,6 +239,8 @@ def set_option(*args) -> None: Examples -------- + Option-Value Pair Input: + >>> pd.set_option("display.max_columns", 4) >>> df = pd.DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]) >>> df @@ -247,8 +249,23 @@ def set_option(*args) -> None: 1 6 7 ... 9 10 [2 rows x 5 columns] >>> pd.reset_option("display.max_columns") + + Dictionary Input: + + >>> pd.set_option({"display.max_columns": 4, "display.precision": 1}) + >>> df = pd.DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]) + >>> df + 0 1 ... 3 4 + 0 1 2 ... 4 5 + 1 6 7 ... 9 10 + [2 rows x 5 columns] + >>> pd.reset_option("display.max_columns") + >>> pd.reset_option("display.precision") """ - # must at least 1 arg deal with constraints later + # Handle dictionary input + if len(args) == 1 and isinstance(args[0], dict): + args = tuple(kv for item in args[0].items() for kv in item) + nargs = len(args) if not nargs or nargs % 2 != 0: raise ValueError("Must provide an even number of non-keyword arguments") @@ -440,9 +457,10 @@ def option_context(*args) -> Generator[None]: Parameters ---------- - *args : str | object + *args : str | object | dict An even amount of arguments provided in pairs which will be - interpreted as (pattern, value) pairs. + interpreted as (pattern, value) pairs. Alternatively, a single + dictionary of {pattern: value} may be provided. Returns ------- @@ -471,7 +489,12 @@ def option_context(*args) -> Generator[None]: >>> from pandas import option_context >>> with option_context("display.max_rows", 10, "display.max_columns", 5): ... pass + >>> with option_context({"display.max_rows": 10, "display.max_columns": 5}): + ... 
pass """ + if len(args) == 1 and isinstance(args[0], dict): + args = tuple(kv for item in args[0].items() for kv in item) + if len(args) % 2 != 0 or len(args) < 2: raise ValueError( "Provide an even amount of arguments as " diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 9dfa4a9486558..f584c0ff9f614 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -391,10 +391,11 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): # clip `covxy / divisor` to ensure coeff is within bounds if divisor != 0: val = covxy / divisor - if val > 1.0: - val = 1.0 - elif val < -1.0: - val = -1.0 + if not cov: + if val > 1.0: + val = 1.0 + elif val < -1.0: + val = -1.0 result[xi, yi] = result[yi, xi] = val else: result[xi, yi] = result[yi, xi] = NaN diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index dda23d9dec98b..60e4ff3fab74e 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -1,4 +1,4 @@ -from typing import Sequence +from collections.abc import Sequence import numpy as np diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 7a810a988e50e..5ee359d84a6ed 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -1,6 +1,6 @@ +from collections.abc import Hashable from typing import ( Any, - Hashable, Literal, overload, ) diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index ffe6c7730bcdc..a680304d55ea2 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -1,6 +1,8 @@ -from typing import ( +from collections.abc import ( Iterator, Sequence, +) +from typing import ( final, overload, ) diff --git a/pandas/_libs/json.pyi b/pandas/_libs/json.pyi index bc4fe68573b94..349320d69d707 100644 --- a/pandas/_libs/json.pyi +++ b/pandas/_libs/json.pyi @@ -1,6 +1,6 @@ +from collections.abc import Callable from typing import ( Any, - Callable, ) def ujson_dumps( diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index daaaacee3487d..331233f37f63d 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -1,12 +1,14 @@ # TODO(npdtypes): Many types specified here can be made more specific/accurate; # the more specific versions are specified in comments +from collections.abc import ( + Callable, + Generator, + Hashable, +) from decimal import Decimal from typing import ( Any, - Callable, Final, - Generator, - Hashable, Literal, TypeAlias, overload, diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 38d9a8f62417c..3b7d659c2150e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2,6 +2,7 @@ from collections import abc from decimal import Decimal from enum import Enum from sys import getsizeof +from types import GenericAlias from typing import ( Literal, _GenericAlias, @@ -777,7 +778,10 @@ cpdef ndarray[object] ensure_string_array( return out arr = arr.to_numpy(dtype=object) elif not util.is_array(arr): - arr = np.array(arr, dtype="object") + # GH#61155: Guarantee a 1-d result when array is a list of lists + input_arr = arr + arr = np.empty(len(arr), dtype="object") + arr[:] = input_arr result = np.asarray(arr, dtype="object") @@ -1295,7 +1299,7 @@ cdef bint c_is_list_like(object obj, bint allow_sets) except -1: getattr(obj, "__iter__", None) is not None and not isinstance(obj, type) # we do not count strings/unicode/bytes as list-like # exclude Generic types that have __iter__ - and not isinstance(obj, (str, bytes, _GenericAlias)) + and not isinstance(obj, (str, bytes, _GenericAlias, GenericAlias)) # exclude zero-dimensional 
duck-arrays, effectively scalars and not (hasattr(obj, "ndim") and obj.ndim == 0) # exclude sets if allow_sets is False diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index a50976767928a..33fc65e5034d0 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -148,6 +148,12 @@ if get_option('buildtype') == 'debug' cython_args += ['--gdb'] endif +# Use shared utility code to reduce wheel sizes +# copied from https://github.com/scikit-learn/scikit-learn/pull/31151/files +if cy.version().version_compare('>=3.1.0') + cython_args += ['--shared=pandas._libs._cyutility'] +endif + foreach ext_name, ext_dict : libs_sources py.extension_module( ext_name, diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi index 6738a1dff4a9e..81fe81930539d 100644 --- a/pandas/_libs/ops.pyi +++ b/pandas/_libs/ops.pyi @@ -1,7 +1,9 @@ -from typing import ( - Any, +from collections.abc import ( Callable, Iterable, +) +from typing import ( + Any, Literal, TypeAlias, overload, diff --git a/pandas/_libs/parsers.pyi b/pandas/_libs/parsers.pyi index 253bb7303cefb..d18f54c546232 100644 --- a/pandas/_libs/parsers.pyi +++ b/pandas/_libs/parsers.pyi @@ -1,5 +1,5 @@ +from collections.abc import Hashable from typing import ( - Hashable, Literal, ) diff --git a/pandas/_libs/properties.pyi b/pandas/_libs/properties.pyi index aaa44a0cf47bf..bbde6ec454202 100644 --- a/pandas/_libs/properties.pyi +++ b/pandas/_libs/properties.pyi @@ -1,5 +1,5 @@ +from collections.abc import Sequence from typing import ( - Sequence, overload, ) diff --git a/pandas/_libs/sparse.pyi b/pandas/_libs/sparse.pyi index 536265b25425e..8727b1a5b0420 100644 --- a/pandas/_libs/sparse.pyi +++ b/pandas/_libs/sparse.pyi @@ -1,4 +1,4 @@ -from typing import Sequence +from collections.abc import Sequence import numpy as np diff --git a/pandas/_libs/testing.pyi b/pandas/_libs/testing.pyi index ab87e58eba9b9..4758483b3b5e7 100644 --- a/pandas/_libs/testing.pyi +++ b/pandas/_libs/testing.pyi @@ -1,4 +1,4 @@ -from typing import Mapping +from collections.abc import Mapping def assert_dict_equal(a: Mapping, b: Mapping, compare_keys: bool = ...) -> bool: ... def assert_almost_equal( diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index c4acf72ab87d8..45552108f8c15 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -797,7 +797,7 @@ cdef int64_t parse_pydatetime( dts : *npy_datetimestruct Needed to use in pydatetime_to_dt64, which writes to it. creso : NPY_DATETIMEUNIT - Resolution to store the the result. + Resolution to store the result. 
Raises ------ diff --git a/pandas/_libs/tslibs/meson.build b/pandas/_libs/tslibs/meson.build index 052a8568b76af..ac43dc7db5fb7 100644 --- a/pandas/_libs/tslibs/meson.build +++ b/pandas/_libs/tslibs/meson.build @@ -28,6 +28,12 @@ if get_option('buildtype') == 'debug' cython_args += ['--gdb'] endif +# Use shared utility code to reduce wheel sizes +# copied from https://github.com/scikit-learn/scikit-learn/pull/31151/files +if cy.version().version_compare('>=3.1.0') + cython_args += ['--shared=pandas._libs._cyutility'] +endif + foreach ext_name, ext_dict : tslibs_sources py.extension_module( ext_name, diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi index d3b10fbe79cb9..ff3bb5b70801e 100644 --- a/pandas/_libs/tslibs/nattype.pyi +++ b/pandas/_libs/tslibs/nattype.pyi @@ -1,7 +1,5 @@ from datetime import ( - date as date_, datetime, - time as time_, timedelta, tzinfo as _tzinfo, ) @@ -99,7 +97,6 @@ class NaTType: ambiguous: bool | Literal["raise"] | NaTType = ..., nonexistent: TimestampNonexistent = ..., ) -> NaTType: ... - def combine(cls, date: date_, time: time_) -> NoReturn: ... @property def tzinfo(self) -> None: ... @property diff --git a/pandas/_libs/tslibs/offsets.pyi b/pandas/_libs/tslibs/offsets.pyi index f9f56d38c5e0a..a71aa42b4f671 100644 --- a/pandas/_libs/tslibs/offsets.pyi +++ b/pandas/_libs/tslibs/offsets.pyi @@ -1,3 +1,4 @@ +from collections.abc import Collection from datetime import ( datetime, time, @@ -5,7 +6,6 @@ from datetime import ( ) from typing import ( Any, - Collection, Literal, TypeVar, overload, @@ -230,7 +230,13 @@ class FY5253Quarter(FY5253Mixin): variation: Literal["nearest", "last"] = ..., ) -> None: ... -class Easter(SingleConstructorOffset): ... +class Easter(SingleConstructorOffset): + def __init__( + self, + n: int = ..., + normalize: bool = ..., + method: int = ..., + ) -> None: ... class _CustomBusinessMonth(BusinessMixin): def __init__( diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index a16964435ef50..87214c3758d5c 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -4520,6 +4520,12 @@ cdef class Easter(SingleConstructorOffset): The number of years represented. normalize : bool, default False Normalize start/end dates to midnight before generating date range. + method : int, default 3 + The method used to calculate the date of Easter. Valid options are: + - 1 (EASTER_JULIAN): Original calculation in Julian calendar + - 2 (EASTER_ORTHODOX): Original method, date converted to Gregorian calendar + - 3 (EASTER_WESTERN): Revised method, in Gregorian calendar + These constants are defined in the `dateutil.easter` module. 
See Also -------- @@ -4532,15 +4538,32 @@ cdef class Easter(SingleConstructorOffset): Timestamp('2022-04-17 00:00:00') """ + _attributes = tuple(["n", "normalize", "method"]) + + cdef readonly: + int method + + from dateutil.easter import EASTER_WESTERN + + def __init__(self, n=1, normalize=False, method=EASTER_WESTERN): + BaseOffset.__init__(self, n, normalize) + + self.method = method + + if method < 1 or method > 3: + raise ValueError(f"Method must be 1<=method<=3, got {method}") + cpdef __setstate__(self, state): + from dateutil.easter import EASTER_WESTERN self.n = state.pop("n") self.normalize = state.pop("normalize") + self.method = state.pop("method", EASTER_WESTERN) @apply_wraps def _apply(self, other: datetime) -> datetime: from dateutil.easter import easter - current_easter = easter(other.year) + current_easter = easter(other.year, method=self.method) current_easter = datetime( current_easter.year, current_easter.month, current_easter.day ) @@ -4555,7 +4578,7 @@ cdef class Easter(SingleConstructorOffset): # NOTE: easter returns a datetime.date so we have to convert to type of # other - new = easter(other.year + n) + new = easter(other.year + n, method=self.method) new = datetime( new.year, new.month, @@ -4573,7 +4596,7 @@ cdef class Easter(SingleConstructorOffset): from dateutil.easter import easter - return date(dt.year, dt.month, dt.day) == easter(dt.year) + return date(dt.year, dt.month, dt.day) == easter(dt.year, method=self.method) # ---------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index fb89f1328529d..b443aa7bede22 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -444,6 +444,9 @@ def array_strptime( else: val = str(val) + out_local = 0 + out_tzoffset = 0 + if fmt == "ISO8601": string_to_dts_succeeded = not string_to_dts( val, &dts, &out_bestunit, &out_local, diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi index 979a5666661b2..c885543b2fc6d 100644 --- a/pandas/_libs/tslibs/timedeltas.pyi +++ b/pandas/_libs/tslibs/timedeltas.pyi @@ -3,7 +3,6 @@ from typing import ( ClassVar, Literal, TypeAlias, - TypeVar, overload, ) @@ -60,7 +59,6 @@ UnitChoices: TypeAlias = Literal[ "nanos", "nanosecond", ] -_S = TypeVar("_S", bound=timedelta) def get_unit_for_round(freq, creso: int) -> int: ... def disallow_ambiguous_unit(unit: str | None) -> None: ... @@ -95,11 +93,11 @@ class Timedelta(timedelta): _value: int # np.int64 # error: "__new__" must return a class instance (got "Union[Timestamp, NaTType]") def __new__( # type: ignore[misc] - cls: type[_S], + cls: type[Self], value=..., unit: str | None = ..., **kwargs: float | np.integer | np.floating, - ) -> _S | NaTType: ... + ) -> Self | NaTType: ... @classmethod def _from_value_and_reso(cls, value: np.int64, reso: int) -> Timedelta: ... @property diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 23197b9a55afc..390267db8267f 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -200,8 +200,9 @@ class MinMaxReso: See also: timedeltas.MinMaxReso """ - def __init__(self, name): + def __init__(self, name, docstring): self._name = name + self.__doc__ = docstring def __get__(self, obj, type=None): cls = Timestamp @@ -216,11 +217,15 @@ class MinMaxReso: if obj is None: # i.e. 
this is on the class, default to nanos - return cls(val) + result = cls(val) elif self._name == "resolution": - return Timedelta._from_value_and_reso(val, obj._creso) + result = Timedelta._from_value_and_reso(val, obj._creso) else: - return Timestamp._from_value_and_reso(val, obj._creso, tz=None) + result = Timestamp._from_value_and_reso(val, obj._creso, tz=None) + + result.__doc__ = self.__doc__ + + return result def __set__(self, obj, value): raise AttributeError(f"{self._name} is not settable.") @@ -235,9 +240,74 @@ cdef class _Timestamp(ABCTimestamp): dayofweek = _Timestamp.day_of_week dayofyear = _Timestamp.day_of_year - min = MinMaxReso("min") - max = MinMaxReso("max") - resolution = MinMaxReso("resolution") # GH#21336, GH#21365 + _docstring_min = """ + Returns the minimum bound possible for Timestamp. + + This property provides access to the smallest possible value that + can be represented by a Timestamp object. + + Returns + ------- + Timestamp + + See Also + -------- + Timestamp.max: Returns the maximum bound possible for Timestamp. + Timestamp.resolution: Returns the smallest possible difference between + non-equal Timestamp objects. + + Examples + -------- + >>> pd.Timestamp.min + Timestamp('1677-09-21 00:12:43.145224193') + """ + + _docstring_max = """ + Returns the maximum bound possible for Timestamp. + + This property provides access to the largest possible value that + can be represented by a Timestamp object. + + Returns + ------- + Timestamp + + See Also + -------- + Timestamp.min: Returns the minimum bound possible for Timestamp. + Timestamp.resolution: Returns the smallest possible difference between + non-equal Timestamp objects. + + Examples + -------- + >>> pd.Timestamp.max + Timestamp('2262-04-11 23:47:16.854775807') + """ + + _docstring_reso = """ + Returns the smallest possible difference between non-equal Timestamp objects. + + The resolution value is determined by the underlying representation of time + units and is equivalent to Timedelta(nanoseconds=1). + + Returns + ------- + Timedelta + + See Also + -------- + Timestamp.max: Returns the maximum bound possible for Timestamp. + Timestamp.min: Returns the minimum bound possible for Timestamp. 
+ + Examples + -------- + >>> pd.Timestamp.resolution + Timedelta('0 days 00:00:00.000000001') + """ + + min = MinMaxReso("min", _docstring_min) + max = MinMaxReso("max", _docstring_max) + resolution = MinMaxReso("resolution", _docstring_reso) # GH#21336, GH#21365 @property def value(self) -> int: diff --git a/pandas/_libs/tslibs/timezones.pyi b/pandas/_libs/tslibs/timezones.pyi index 4e9f0c6ae6c33..26ffa568a8480 100644 --- a/pandas/_libs/tslibs/timezones.pyi +++ b/pandas/_libs/tslibs/timezones.pyi @@ -1,8 +1,8 @@ +from collections.abc import Callable from datetime import ( datetime, tzinfo, ) -from typing import Callable import numpy as np diff --git a/pandas/_libs/tslibs/tzconversion.pyi b/pandas/_libs/tslibs/tzconversion.pyi index 2108fa0f35547..07ee46858577a 100644 --- a/pandas/_libs/tslibs/tzconversion.pyi +++ b/pandas/_libs/tslibs/tzconversion.pyi @@ -1,8 +1,8 @@ +from collections.abc import Iterable from datetime import ( timedelta, tzinfo, ) -from typing import Iterable import numpy as np diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index b4bdd7e05cf0e..99413751cd5c2 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -1,6 +1,6 @@ +from collections.abc import Callable from typing import ( Any, - Callable, Literal, ) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 2baed13cbd7be..0c8ea28b60ce8 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -6,6 +6,7 @@ from libc.math cimport ( sqrt, ) from libcpp.deque cimport deque +from libcpp.stack cimport stack from libcpp.unordered_map cimport unordered_map from pandas._libs.algos cimport TiebreakEnumType @@ -988,39 +989,29 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start, # ---------------------------------------------------------------------- -# Moving maximum / minimum code taken from Bottleneck -# Licence at LICENSES/BOTTLENECK_LICENCE - - -cdef float64_t init_mm(float64_t ai, Py_ssize_t *nobs, bint is_max) noexcept nogil: - - if ai == ai: - nobs[0] = nobs[0] + 1 - elif is_max: - ai = MINfloat64 - else: - ai = MAXfloat64 - - return ai - - -cdef void remove_mm(float64_t aold, Py_ssize_t *nobs) noexcept nogil: - """ remove a value from the mm calc """ - if aold == aold: - nobs[0] = nobs[0] - 1 - - -cdef float64_t calc_mm(int64_t minp, Py_ssize_t nobs, - float64_t value) noexcept nogil: - cdef: - float64_t result +cdef int64_t bisect_left( + deque[int64_t]& a, + int64_t x, + int64_t lo=0, + int64_t hi=-1 +) nogil: + """Same as https://docs.python.org/3/library/bisect.html.""" + + cdef int64_t mid + if hi == -1: + hi = a.size() + while lo < hi: + mid = (lo + hi) // 2 + if a.at(mid) < x: + lo = mid + 1 + else: + hi = mid + return lo - if nobs >= minp: - result = value - else: - result = NaN +from libc.math cimport isnan - return result +# Prior version of moving maximum / minimum code taken from Bottleneck +# Licence at LICENSES/BOTTLENECK_LICENCE def roll_max(ndarray[float64_t] values, ndarray[int64_t] start, @@ -1068,69 +1059,110 @@ def roll_min(ndarray[float64_t] values, ndarray[int64_t] start, return _roll_min_max(values, start, end, minp, is_max=0) -cdef _roll_min_max(ndarray[float64_t] values, - ndarray[int64_t] starti, - ndarray[int64_t] endi, - int64_t minp, - bint is_max): +def _roll_min_max( + ndarray[float64_t] values, + ndarray[int64_t] start, + ndarray[int64_t] end, + int64_t minp, + bint is_max +): cdef: - float64_t ai - int64_t 
curr_win_size, start - Py_ssize_t i, k, nobs = 0, N = len(starti) - deque Q[int64_t] # min/max always the front - deque W[int64_t] # track the whole window for nobs compute + Py_ssize_t i, i_next, k, valid_start, last_end, last_start, N = len(start) + # Indices of bounded extrema in `values`. `candidates[i]` is always increasing. + # `values[candidates[i]]` is decreasing for max and increasing for min. + deque candidates[int64_t] + # Indices of largest windows that "cover" preceding windows. + stack dominators[int64_t] ndarray[float64_t, ndim=1] output + Py_ssize_t this_start, this_end, stash_start + int64_t q_idx + output = np.empty(N, dtype=np.float64) - Q = deque[int64_t]() - W = deque[int64_t]() + candidates = deque[int64_t]() + dominators = stack[int64_t]() + + # This function was "ported" / translated from sliding_min_max() + # in /pandas/core/_numba/kernels/min_max_.py. + # (See there for credits and some comments.) + # Code translation assumptions/rules: + # - min_periods --> minp + # - deque[0] --> front() + # - deque[-1] --> back() + # - stack[-1] --> top() + # - bool(stack/deque) --> !empty() + # - deque.append() --> push_back() + # - stack.append() --> push() + # - deque.popleft --> pop_front() + # - deque.pop() --> pop_back() with nogil: + if minp < 1: + minp = 1 + + if N>2: + i_next = N - 1 + for i in range(N - 2, -1, -1): + if start[i_next] < start[i] \ + and ( + dominators.empty() + or start[dominators.top()] > start[i_next] + ): + dominators.push(i_next) + i_next = i + + # NaN tracking to guarantee minp + valid_start = -minp + + last_end = 0 + last_start = -1 - # This is using a modified version of the C++ code in this - # SO post: https://stackoverflow.com/a/12239580 - # The original impl didn't deal with variable window sizes - # So the code was optimized for that - - # first window's size - curr_win_size = endi[0] - starti[0] - # GH 32865 - # Anchor output index to values index to provide custom - # BaseIndexer support for i in range(N): + this_start = start[i] + this_end = end[i] - curr_win_size = endi[i] - starti[i] - if i == 0: - start = starti[i] - else: - start = endi[i - 1] - - for k in range(start, endi[i]): - ai = init_mm(values[k], &nobs, is_max) - # Discard previous entries if we find new min or max - if is_max: - while not Q.empty() and ((ai >= values[Q.back()]) or - values[Q.back()] != values[Q.back()]): - Q.pop_back() - else: - while not Q.empty() and ((ai <= values[Q.back()]) or - values[Q.back()] != values[Q.back()]): - Q.pop_back() - Q.push_back(k) - W.push_back(k) - - # Discard entries outside and left of current window - while not Q.empty() and Q.front() <= starti[i] - 1: - Q.pop_front() - while not W.empty() and W.front() <= starti[i] - 1: - remove_mm(values[W.front()], &nobs) - W.pop_front() - - # Save output based on index in input value array - if not Q.empty() and curr_win_size > 0: - output[i] = calc_mm(minp, nobs, values[Q.front()]) + if (not dominators.empty() and dominators.top() == i): + dominators.pop() + + if not (this_end > last_end + or (this_end == last_end and this_start >= last_start)): + raise ValueError( + "Start/End ordering requirement is violated at index {}".format(i)) + + if dominators.empty(): + stash_start = this_start else: + stash_start = min(this_start, start[dominators.top()]) + + while not candidates.empty() and candidates.front() < stash_start: + candidates.pop_front() + + for k in range(last_end, this_end): + if not isnan(values[k]): + valid_start += 1 + while valid_start >= 0 and isnan(values[valid_start]): + 
valid_start += 1 + + if is_max: + while (not candidates.empty() + and values[k] >= values[candidates.back()]): + candidates.pop_back() + else: + while (not candidates.empty() + and values[k] <= values[candidates.back()]): + candidates.pop_back() + candidates.push_back(k) + + if candidates.empty() or this_start > valid_start: output[i] = NaN + elif candidates.front() >= this_start: + # ^^ This is here to avoid costly bisection for fixed window sizes. + output[i] = values[candidates.front()] + else: + q_idx = bisect_left(candidates, this_start, lo=1) + output[i] = values[candidates[q_idx]] + last_end = this_end + last_start = this_start return output @@ -1322,8 +1354,8 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start, if interpolation_type == LINEAR: vlow = skiplist_get(skiplist, idx, &ret) vhigh = skiplist_get(skiplist, idx + 1, &ret) - output[i] = ((vlow + (vhigh - vlow) * - (idx_with_fraction - idx))) + output[i] = (vlow + (vhigh - vlow) * + (idx_with_fraction - idx)) elif interpolation_type == LOWER: output[i] = skiplist_get(skiplist, idx, &ret) elif interpolation_type == HIGHER: diff --git a/pandas/_libs/window/meson.build b/pandas/_libs/window/meson.build index 1d49bba47e139..8c00a98b1241a 100644 --- a/pandas/_libs/window/meson.build +++ b/pandas/_libs/window/meson.build @@ -1,7 +1,14 @@ +cy_args = ['-X always_allow_keywords=true'] +# Use shared utility code to reduce wheel sizes +# copied from https://github.com/scikit-learn/scikit-learn/pull/31151/files +if cy.version().version_compare('>=3.1.0') + cython_args += ['--shared=pandas._libs._cyutility'] +endif + py.extension_module( 'aggregations', ['aggregations.pyx'], - cython_args: ['-X always_allow_keywords=true'], + cython_args: cy_args, include_directories: [inc_np, inc_pd], subdir: 'pandas/_libs/window', override_options: ['cython_language=cpp'], @@ -11,7 +18,7 @@ py.extension_module( py.extension_module( 'indexers', ['indexers.pyx'], - cython_args: ['-X always_allow_keywords=true'], + cython_args: cy_args, include_directories: [inc_np, inc_pd], subdir: 'pandas/_libs/window', install: true, diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py index 99826de51e1bf..da147c117ad43 100644 --- a/pandas/_testing/contexts.py +++ b/pandas/_testing/contexts.py @@ -3,6 +3,7 @@ from contextlib import contextmanager import os from pathlib import Path +import sys import tempfile from typing import ( IO, @@ -81,7 +82,9 @@ def setTZ(tz) -> None: pass else: os.environ["TZ"] = tz - time.tzset() + # Next line allows typing checks to pass on Windows + if sys.platform != "win32": + time.tzset() orig_tz = os.environ.get("TZ") setTZ(tz) diff --git a/pandas/_typing.py b/pandas/_typing.py index 4365ee85f72e3..889252bb00438 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,5 +1,6 @@ from __future__ import annotations +from builtins import type as type_t # pyright: ignore[reportUnusedImport] from collections.abc import ( Callable, Hashable, @@ -20,22 +21,23 @@ TYPE_CHECKING, Any, Literal, - Optional, Protocol, - Type as type_t, + TypeAlias, TypeVar, Union, overload, ) import numpy as np +import numpy.typing as npt # To prevent import cycles place any internal imports in the branch below # and use a string literal forward reference to it in subsequent types # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles -if TYPE_CHECKING: - import numpy.typing as npt +# Note that Union is needed when a Union includes a pandas type + +if TYPE_CHECKING: from pandas._libs import ( NaTType, Period, @@ 
-76,19 +78,12 @@ from pandas.io.formats.format import EngFormatter from pandas.tseries.holiday import AbstractHolidayCalendar - ScalarLike_co = Union[ - int, - float, - complex, - str, - bytes, - np.generic, - ] + ScalarLike_co: TypeAlias = int | float | complex | str | bytes | np.generic # numpy compatible types - NumpyValueArrayLike = Union[ScalarLike_co, npt.ArrayLike] + NumpyValueArrayLike: TypeAlias = ScalarLike_co | npt.ArrayLike # Name "npt._ArrayLikeInt_co" is not defined [name-defined] - NumpySorter = Optional[npt._ArrayLikeInt_co] # type: ignore[name-defined] + NumpySorter: TypeAlias = npt._ArrayLikeInt_co | None # type: ignore[name-defined] from typing import ( ParamSpec, @@ -107,7 +102,6 @@ from typing_extensions import Unpack # pyright: ignore[reportUnusedImport] else: - npt: Any = None ParamSpec: Any = None Self: Any = None TypeGuard: Any = None @@ -120,10 +114,10 @@ # array-like -ArrayLike = Union["ExtensionArray", np.ndarray] +ArrayLike: TypeAlias = Union["ExtensionArray", np.ndarray] ArrayLikeT = TypeVar("ArrayLikeT", "ExtensionArray", np.ndarray) -AnyArrayLike = Union[ArrayLike, "Index", "Series"] -TimeArrayLike = Union["DatetimeArray", "TimedeltaArray"] +AnyArrayLike: TypeAlias = Union[ArrayLike, "Index", "Series"] +TimeArrayLike: TypeAlias = Union["DatetimeArray", "TimedeltaArray"] # list-like @@ -152,31 +146,31 @@ def count(self, value: Any, /) -> int: ... def __reversed__(self) -> Iterator[_T_co]: ... -ListLike = Union[AnyArrayLike, SequenceNotStr, range] +ListLike: TypeAlias = AnyArrayLike | SequenceNotStr | range # scalars -PythonScalar = Union[str, float, bool] -DatetimeLikeScalar = Union["Period", "Timestamp", "Timedelta"] -PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"] -Scalar = Union[PythonScalar, PandasScalar, np.datetime64, np.timedelta64, date] -IntStrT = TypeVar("IntStrT", bound=Union[int, str]) - +PythonScalar: TypeAlias = str | float | bool +DatetimeLikeScalar: TypeAlias = Union["Period", "Timestamp", "Timedelta"] +PandasScalar: TypeAlias = Union["Period", "Timestamp", "Timedelta", "Interval"] +Scalar: TypeAlias = PythonScalar | PandasScalar | np.datetime64 | np.timedelta64 | date +IntStrT = TypeVar("IntStrT", bound=int | str) # timestamp and timedelta convertible types -TimestampConvertibleTypes = Union[ +TimestampConvertibleTypes: TypeAlias = Union[ "Timestamp", date, np.datetime64, np.int64, float, str ] -TimestampNonexistent = Union[ - Literal["shift_forward", "shift_backward", "NaT", "raise"], timedelta -] -TimedeltaConvertibleTypes = Union[ +TimestampNonexistent: TypeAlias = ( + Literal["shift_forward", "shift_backward", "NaT", "raise"] | timedelta +) + +TimedeltaConvertibleTypes: TypeAlias = Union[ "Timedelta", timedelta, np.timedelta64, np.int64, float, str ] -Timezone = Union[str, tzinfo] +Timezone: TypeAlias = str | tzinfo -ToTimestampHow = Literal["s", "e", "start", "end"] +ToTimestampHow: TypeAlias = Literal["s", "e", "start", "end"] # NDFrameT is stricter and ensures that the same subclass of NDFrame always is # used. E.g. `def func(a: NDFrameT) -> NDFrameT: ...` means that if a @@ -188,69 +182,66 @@ def __reversed__(self) -> Iterator[_T_co]: ... FreqIndexT = TypeVar("FreqIndexT", "DatetimeIndex", "PeriodIndex", "TimedeltaIndex") NumpyIndexT = TypeVar("NumpyIndexT", np.ndarray, "Index") -AxisInt = int -Axis = Union[AxisInt, Literal["index", "columns", "rows"]] -IndexLabel = Union[Hashable, Sequence[Hashable]] -Level = Hashable -Shape = tuple[int, ...] 
-Suffixes = Sequence[Optional[str]] -Ordered = Optional[bool] -JSONSerializable = Optional[Union[PythonScalar, list, dict]] -Frequency = Union[str, "BaseOffset"] -Axes = ListLike - -RandomState = Union[ - int, - np.ndarray, - np.random.Generator, - np.random.BitGenerator, - np.random.RandomState, -] +AxisInt: TypeAlias = int +Axis: TypeAlias = AxisInt | Literal["index", "columns", "rows"] +IndexLabel: TypeAlias = Hashable | Sequence[Hashable] +Level: TypeAlias = Hashable +Shape: TypeAlias = tuple[int, ...] +Suffixes: TypeAlias = Sequence[str | None] +Ordered: TypeAlias = bool | None +JSONSerializable: TypeAlias = PythonScalar | list | dict | None +Frequency: TypeAlias = Union[str, "BaseOffset"] +Axes: TypeAlias = ListLike + +RandomState: TypeAlias = ( + int + | np.ndarray + | np.random.Generator + | np.random.BitGenerator + | np.random.RandomState +) + # dtypes -NpDtype = Union[str, np.dtype, type_t[Union[str, complex, bool, object]]] -Dtype = Union["ExtensionDtype", NpDtype] -AstypeArg = Union["ExtensionDtype", "npt.DTypeLike"] +NpDtype: TypeAlias = str | np.dtype | type[str | complex | bool | object] +Dtype: TypeAlias = Union["ExtensionDtype", NpDtype] +AstypeArg: TypeAlias = Union["ExtensionDtype", npt.DTypeLike] # DtypeArg specifies all allowable dtypes in a functions its dtype argument -DtypeArg = Union[Dtype, Mapping[Hashable, Dtype]] -DtypeObj = Union[np.dtype, "ExtensionDtype"] +DtypeArg: TypeAlias = Dtype | Mapping[Hashable, Dtype] +DtypeObj: TypeAlias = Union[np.dtype, "ExtensionDtype"] # converters -ConvertersArg = dict[Hashable, Callable[[Dtype], Dtype]] +ConvertersArg: TypeAlias = dict[Hashable, Callable[[Dtype], Dtype]] # parse_dates -ParseDatesArg = Union[ - bool, list[Hashable], list[list[Hashable]], dict[Hashable, list[Hashable]] -] +ParseDatesArg: TypeAlias = ( + bool | list[Hashable] | list[list[Hashable]] | dict[Hashable, list[Hashable]] +) # For functions like rename that convert one label to another -Renamer = Union[Mapping[Any, Hashable], Callable[[Any], Hashable]] +Renamer: TypeAlias = Mapping[Any, Hashable] | Callable[[Any], Hashable] # to maintain type information across generic functions and parametrization T = TypeVar("T") # used in decorators to preserve the signature of the function it decorates # see https://mypy.readthedocs.io/en/stable/generics.html#declaring-decorators -FuncType = Callable[..., Any] +FuncType: TypeAlias = Callable[..., Any] F = TypeVar("F", bound=FuncType) TypeT = TypeVar("TypeT", bound=type) # types of vectorized key functions for DataFrame::sort_values and # DataFrame::sort_index, among others -ValueKeyFunc = Optional[Callable[["Series"], Union["Series", AnyArrayLike]]] -IndexKeyFunc = Optional[Callable[["Index"], Union["Index", AnyArrayLike]]] +ValueKeyFunc: TypeAlias = Callable[["Series"], Union["Series", AnyArrayLike]] | None +IndexKeyFunc: TypeAlias = Callable[["Index"], Union["Index", AnyArrayLike]] | None # types of `func` kwarg for DataFrame.aggregate and Series.aggregate -AggFuncTypeBase = Union[Callable, str] -AggFuncTypeDict = MutableMapping[ - Hashable, Union[AggFuncTypeBase, list[AggFuncTypeBase]] +AggFuncTypeBase: TypeAlias = Callable | str +AggFuncTypeDict: TypeAlias = MutableMapping[ + Hashable, AggFuncTypeBase | list[AggFuncTypeBase] ] -AggFuncType = Union[ - AggFuncTypeBase, - list[AggFuncTypeBase], - AggFuncTypeDict, -] -AggObjType = Union[ +AggFuncType: TypeAlias = AggFuncTypeBase | list[AggFuncTypeBase] | AggFuncTypeDict +AggObjType: TypeAlias = Union[ "Series", "DataFrame", "GroupBy", @@ -260,7 +251,7 @@ def 
__reversed__(self) -> Iterator[_T_co]: ... "Resampler", ] -PythonFuncType = Callable[[Any], Any] +PythonFuncType: TypeAlias = Callable[[Any], Any] # filenames and file-like-objects AnyStr_co = TypeVar("AnyStr_co", str, bytes, covariant=True) @@ -330,31 +321,30 @@ def closed(self) -> bool: ... -FilePath = Union[str, "PathLike[str]"] +FilePath: TypeAlias = str | PathLike[str] # for arbitrary kwargs passed during reading/writing files -StorageOptions = Optional[dict[str, Any]] - +StorageOptions: TypeAlias = dict[str, Any] | None # compression keywords and compression -CompressionDict = dict[str, Any] -CompressionOptions = Optional[ - Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd", "tar"], CompressionDict] -] +CompressionDict: TypeAlias = dict[str, Any] +CompressionOptions: TypeAlias = ( + Literal["infer", "gzip", "bz2", "zip", "xz", "zstd", "tar"] | CompressionDict | None +) # types in DataFrameFormatter -FormattersType = Union[ - list[Callable], tuple[Callable, ...], Mapping[Union[str, int], Callable] -] -ColspaceType = Mapping[Hashable, Union[str, int]] -FloatFormatType = Union[str, Callable, "EngFormatter"] -ColspaceArgType = Union[ - str, int, Sequence[Union[str, int]], Mapping[Hashable, Union[str, int]] -] +FormattersType: TypeAlias = ( + list[Callable] | tuple[Callable, ...] | Mapping[str | int, Callable] +) +ColspaceType: TypeAlias = Mapping[Hashable, str | int] +FloatFormatType: TypeAlias = Union[str, Callable, "EngFormatter"] +ColspaceArgType: TypeAlias = ( + str | int | Sequence[str | int] | Mapping[Hashable, str | int] +) # Arguments for fillna() -FillnaOptions = Literal["backfill", "bfill", "ffill", "pad"] -InterpolateOptions = Literal[ +FillnaOptions: TypeAlias = Literal["backfill", "bfill", "ffill", "pad"] +InterpolateOptions: TypeAlias = Literal[ "linear", "time", "index", @@ -376,7 +366,7 @@ def closed(self) -> bool: ] # internals -Manager = Union["BlockManager", "SingleBlockManager"] +Manager: TypeAlias = Union["BlockManager", "SingleBlockManager"] # indexing # PositionalIndexer -> valid 1D positional indexer, e.g. can pass @@ -389,63 +379,62 @@ def closed(self) -> bool: # https://github.com/python/typing/issues/684#issuecomment-548203158 # https://bugs.python.org/issue41810 # Using List[int] here rather than Sequence[int] to disallow tuples. 
-ScalarIndexer = Union[int, np.integer] -SequenceIndexer = Union[slice, list[int], np.ndarray] -PositionalIndexer = Union[ScalarIndexer, SequenceIndexer] -PositionalIndexerTuple = tuple[PositionalIndexer, PositionalIndexer] -PositionalIndexer2D = Union[PositionalIndexer, PositionalIndexerTuple] -if TYPE_CHECKING: - TakeIndexer = Union[Sequence[int], Sequence[np.integer], npt.NDArray[np.integer]] -else: - TakeIndexer = Any +ScalarIndexer: TypeAlias = int | np.integer +SequenceIndexer: TypeAlias = slice | list[int] | np.ndarray +PositionalIndexer: TypeAlias = ScalarIndexer | SequenceIndexer +PositionalIndexerTuple: TypeAlias = tuple[PositionalIndexer, PositionalIndexer] +PositionalIndexer2D: TypeAlias = PositionalIndexer | PositionalIndexerTuple +TakeIndexer: TypeAlias = Sequence[int] | Sequence[np.integer] | npt.NDArray[np.integer] # Shared by functions such as drop and astype -IgnoreRaise = Literal["ignore", "raise"] +IgnoreRaise: TypeAlias = Literal["ignore", "raise"] # Windowing rank methods -WindowingRankType = Literal["average", "min", "max"] +WindowingRankType: TypeAlias = Literal["average", "min", "max"] # read_csv engines -CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"] +CSVEngine: TypeAlias = Literal["c", "python", "pyarrow", "python-fwf"] # read_json engines -JSONEngine = Literal["ujson", "pyarrow"] +JSONEngine: TypeAlias = Literal["ujson", "pyarrow"] # read_xml parsers -XMLParsers = Literal["lxml", "etree"] +XMLParsers: TypeAlias = Literal["lxml", "etree"] # read_html flavors -HTMLFlavors = Literal["lxml", "html5lib", "bs4"] +HTMLFlavors: TypeAlias = Literal["lxml", "html5lib", "bs4"] # Interval closed type -IntervalLeftRight = Literal["left", "right"] -IntervalClosedType = Union[IntervalLeftRight, Literal["both", "neither"]] +IntervalLeftRight: TypeAlias = Literal["left", "right"] +IntervalClosedType: TypeAlias = IntervalLeftRight | Literal["both", "neither"] # datetime and NaTType -DatetimeNaTType = Union[datetime, "NaTType"] -DateTimeErrorChoices = Literal["raise", "coerce"] +DatetimeNaTType: TypeAlias = Union[datetime, "NaTType"] +DateTimeErrorChoices: TypeAlias = Literal["raise", "coerce"] # sort_index -SortKind = Literal["quicksort", "mergesort", "heapsort", "stable"] -NaPosition = Literal["first", "last"] +SortKind: TypeAlias = Literal["quicksort", "mergesort", "heapsort", "stable"] +NaPosition: TypeAlias = Literal["first", "last"] # Arguments for nsmallest and nlargest -NsmallestNlargestKeep = Literal["first", "last", "all"] +NsmallestNlargestKeep: TypeAlias = Literal["first", "last", "all"] # quantile interpolation -QuantileInterpolation = Literal["linear", "lower", "higher", "midpoint", "nearest"] +QuantileInterpolation: TypeAlias = Literal[ + "linear", "lower", "higher", "midpoint", "nearest" +] # plotting -PlottingOrientation = Literal["horizontal", "vertical"] +PlottingOrientation: TypeAlias = Literal["horizontal", "vertical"] # dropna -AnyAll = Literal["any", "all"] +AnyAll: TypeAlias = Literal["any", "all"] # merge -MergeHow = Literal[ +MergeHow: TypeAlias = Literal[ "left", "right", "inner", "outer", "cross", "left_anti", "right_anti" ] -MergeValidate = Literal[ +MergeValidate: TypeAlias = Literal[ "one_to_one", "1:1", "one_to_many", @@ -457,8 +446,8 @@ def closed(self) -> bool: ] # join -JoinHow = Literal["left", "right", "inner", "outer"] -JoinValidate = Literal[ +JoinHow: TypeAlias = Literal["left", "right", "inner", "outer"] +JoinValidate: TypeAlias = Literal[ "one_to_one", "1:1", "one_to_many", @@ -470,25 +459,28 @@ def closed(self) -> bool: ] # 
reindex -ReindexMethod = Union[FillnaOptions, Literal["nearest"]] +ReindexMethod: TypeAlias = FillnaOptions | Literal["nearest"] -MatplotlibColor = Union[str, Sequence[float]] -TimeGrouperOrigin = Union[ +MatplotlibColor: TypeAlias = str | Sequence[float] +TimeGrouperOrigin: TypeAlias = Union[ "Timestamp", Literal["epoch", "start", "start_day", "end", "end_day"] ] -TimeAmbiguous = Union[Literal["infer", "NaT", "raise"], "npt.NDArray[np.bool_]"] -TimeNonexistent = Union[ - Literal["shift_forward", "shift_backward", "NaT", "raise"], timedelta -] -DropKeep = Literal["first", "last", False] -CorrelationMethod = Union[ - Literal["pearson", "kendall", "spearman"], Callable[[np.ndarray, np.ndarray], float] -] -AlignJoin = Literal["outer", "inner", "left", "right"] -DtypeBackend = Literal["pyarrow", "numpy_nullable"] +TimeAmbiguous: TypeAlias = Literal["infer", "NaT", "raise"] | npt.NDArray[np.bool_] +TimeNonexistent: TypeAlias = ( + Literal["shift_forward", "shift_backward", "NaT", "raise"] | timedelta +) + +DropKeep: TypeAlias = Literal["first", "last", False] +CorrelationMethod: TypeAlias = ( + Literal["pearson", "kendall", "spearman"] + | Callable[[np.ndarray, np.ndarray], float] +) -TimeUnit = Literal["s", "ms", "us", "ns"] -OpenFileErrors = Literal[ +AlignJoin: TypeAlias = Literal["outer", "inner", "left", "right"] +DtypeBackend: TypeAlias = Literal["pyarrow", "numpy_nullable"] + +TimeUnit: TypeAlias = Literal["s", "ms", "us", "ns"] +OpenFileErrors: TypeAlias = Literal[ "strict", "ignore", "replace", @@ -499,34 +491,32 @@ def closed(self) -> bool: ] # update -UpdateJoin = Literal["left"] +UpdateJoin: TypeAlias = Literal["left"] # applymap -NaAction = Literal["ignore"] +NaAction: TypeAlias = Literal["ignore"] # from_dict -FromDictOrient = Literal["columns", "index", "tight"] +FromDictOrient: TypeAlias = Literal["columns", "index", "tight"] # to_stata -ToStataByteorder = Literal[">", "<", "little", "big"] +ToStataByteorder: TypeAlias = Literal[">", "<", "little", "big"] # ExcelWriter -ExcelWriterIfSheetExists = Literal["error", "new", "replace", "overlay"] -ExcelWriterMergeCells = Union[bool, Literal["columns"]] +ExcelWriterIfSheetExists: TypeAlias = Literal["error", "new", "replace", "overlay"] +ExcelWriterMergeCells: TypeAlias = bool | Literal["columns"] # Offsets -OffsetCalendar = Union[np.busdaycalendar, "AbstractHolidayCalendar"] +OffsetCalendar: TypeAlias = Union[np.busdaycalendar, "AbstractHolidayCalendar"] # read_csv: usecols -UsecolsArgType = Union[ - SequenceNotStr[Hashable], - range, - AnyArrayLike, - Callable[[HashableT], bool], - None, -] +UsecolsArgType: TypeAlias = ( + SequenceNotStr[Hashable] | range | AnyArrayLike | Callable[[HashableT], bool] | None +) # maintain the sub-type of any hashable sequence SequenceT = TypeVar("SequenceT", bound=Sequence[Hashable]) -SliceType = Optional[Hashable] +SliceType: TypeAlias = Hashable | None + +__all__ = ["type_t"] diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 6b90389a62056..f01dfab0de829 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -22,38 +22,38 @@ VERSIONS = { "adbc-driver-postgresql": "0.10.0", "adbc-driver-sqlite": "0.8.0", - "bs4": "4.11.2", - "blosc": "1.21.3", + "bs4": "4.12.3", "bottleneck": "1.3.6", - "fastparquet": "2023.10.0", - "fsspec": "2022.11.0", + "fastparquet": "2024.2.0", + "fsspec": "2023.12.2", "html5lib": "1.1", "hypothesis": "6.84.0", - "gcsfs": "2022.11.0", - "jinja2": "3.1.2", + "gcsfs": "2023.12.2", + "jinja2": "3.1.3", "lxml.etree": "4.9.2", - 
"matplotlib": "3.6.3", - "numba": "0.56.4", - "numexpr": "2.8.4", + "matplotlib": "3.8.3", + "numba": "0.59.0", + "numexpr": "2.9.0", "odfpy": "1.4.1", - "openpyxl": "3.1.0", + "openpyxl": "3.1.2", "psycopg2": "2.9.6", # (dt dec pq3 ext lo64) - "pymysql": "1.0.2", + "pymysql": "1.1.0", "pyarrow": "10.0.1", - "pyreadstat": "1.2.0", + "pyiceberg": "0.7.1", + "pyreadstat": "1.2.6", "pytest": "7.3.2", "python-calamine": "0.1.7", "pytz": "2023.4", "pyxlsb": "1.0.10", - "s3fs": "2022.11.0", - "scipy": "1.10.0", + "s3fs": "2023.12.2", + "scipy": "1.12.0", "sqlalchemy": "2.0.0", "tables": "3.8.0", "tabulate": "0.9.0", - "xarray": "2022.12.0", + "xarray": "2024.1.1", "xlrd": "2.0.1", - "xlsxwriter": "3.0.5", - "zstandard": "0.19.0", + "xlsxwriter": "3.2.0", + "zstandard": "0.22.0", "tzdata": "2022.7", "qtpy": "2.3.0", "pyqt5": "5.15.9", diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 3306b36d71806..e95b44c879940 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -36,8 +36,8 @@ r".*In the future `np\.long` will be defined as.*", FutureWarning, ) - np_long = np.long # type: ignore[attr-defined] - np_ulong = np.ulong # type: ignore[attr-defined] + np_long = np.long + np_ulong = np.ulong except AttributeError: np_long = np.int_ np_ulong = np.uint diff --git a/pandas/core/_numba/kernels/min_max_.py b/pandas/core/_numba/kernels/min_max_.py index 68aa1446bbe3c..c03f20c871012 100644 --- a/pandas/core/_numba/kernels/min_max_.py +++ b/pandas/core/_numba/kernels/min_max_.py @@ -9,7 +9,10 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Any, +) import numba import numpy as np @@ -18,6 +21,20 @@ from pandas._typing import npt +@numba.njit(nogil=True, parallel=False) +def bisect_left(a: list[Any], x: Any, lo: int = 0, hi: int = -1) -> int: + """Same as https://docs.python.org/3/library/bisect.html; not in numba yet!""" + if hi == -1: + hi = len(a) + while lo < hi: + mid = (lo + hi) // 2 + if a[mid] < x: + lo = mid + 1 + else: + hi = mid + return lo + + @numba.jit(nopython=True, nogil=True, parallel=False) def sliding_min_max( values: np.ndarray, @@ -27,55 +44,87 @@ def sliding_min_max( min_periods: int, is_max: bool, ) -> tuple[np.ndarray, list[int]]: + # Basic idea of the algorithm: https://stackoverflow.com/a/12239580 + # It was generalized to work with an arbitrary list of any window size and position + # by adding the Dominators stack. 
+ N = len(start) - nobs = 0 - output = np.empty(N, dtype=result_dtype) na_pos = [] - # Use deque once numba supports it - # https://github.com/numba/numba/issues/7417 - Q: list = [] - W: list = [] - for i in range(N): - curr_win_size = end[i] - start[i] - if i == 0: - st = start[i] - else: - st = end[i - 1] - - for k in range(st, end[i]): - ai = values[k] - if not np.isnan(ai): - nobs += 1 - elif is_max: - ai = -np.inf - else: - ai = np.inf - # Discard previous entries if we find new min or max - if is_max: - while Q and ((ai >= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]): - Q.pop() - else: - while Q and ((ai <= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]): - Q.pop() - Q.append(k) - W.append(k) - - # Discard entries outside and left of current window - while Q and Q[0] <= start[i] - 1: - Q.pop(0) - while W and W[0] <= start[i] - 1: - if not np.isnan(values[W[0]]): - nobs -= 1 - W.pop(0) - - # Save output based on index in input value array - if Q and curr_win_size > 0 and nobs >= min_periods: - output[i] = values[Q[0]] + output = np.empty(N, dtype=result_dtype) + + def cmp(a: Any, b: Any, is_max: bool) -> bool: + if is_max: + return a >= b else: + return a <= b + + # Indices of bounded extrema in `values`. `candidates[i]` is always increasing. + # `values[candidates[i]]` is decreasing for max and increasing for min. + candidates: list[int] = [] # this is a queue + # Indices of largest windows that "cover" preceding windows. + dominators: list[int] = [] # this is a stack + + if min_periods < 1: + min_periods = 1 + + if N > 2: + i_next = N - 1 # equivalent to i_next = i+1 inside the loop + for i in range(N - 2, -1, -1): + next_dominates = start[i_next] < start[i] + if next_dominates and ( + not dominators or start[dominators[-1]] > start[i_next] + ): + dominators.append(i_next) + i_next = i + + # NaN tracking to guarantee min_periods + valid_start = -min_periods + + last_end = 0 + last_start = -1 + + for i in range(N): + this_start = start[i].item() + this_end = end[i].item() + + if dominators and dominators[-1] == i: + dominators.pop() + + if not ( + this_end > last_end or (this_end == last_end and this_start >= last_start) + ): + raise ValueError( + "Start/End ordering requirement is violated at index " + str(i) + ) + + stash_start = ( + this_start if not dominators else min(this_start, start[dominators[-1]]) + ) + while candidates and candidates[0] < stash_start: + candidates.pop(0) + + for k in range(last_end, this_end): + if not np.isnan(values[k]): + valid_start += 1 + while valid_start >= 0 and np.isnan(values[valid_start]): + valid_start += 1 + while candidates and cmp(values[k], values[candidates[-1]], is_max): + candidates.pop() # Q.pop_back() + candidates.append(k) # Q.push_back(k) + + if not candidates or (this_start > valid_start): if values.dtype.kind != "i": output[i] = np.nan else: na_pos.append(i) + elif candidates[0] >= this_start: + # ^^ This is here to avoid costly bisection for fixed window sizes. + output[i] = values[candidates[0]] + else: + q_idx = bisect_left(candidates, this_start, lo=1) + output[i] = values[candidates[q_idx]] + last_end = this_end + last_start = this_start return output, na_pos diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 78684eacf2d66..0331c26c805b6 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -351,7 +351,7 @@ def register_dataframe_accessor(name: str) -> Callable[[TypeT], TypeT]: AttributeError: The series must contain integer data only. 
>>> df = pd.Series([1, 2, 3]) >>> df.int_accessor.sum() -6""" +np.int64(6)""" @doc(_register_accessor, klass="Series", examples=_register_series_examples) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 76f2fdad591ff..7fc391d3ffb51 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -47,6 +47,7 @@ is_bool_dtype, is_complex_dtype, is_dict_like, + is_dtype_equal, is_extension_array_dtype, is_float, is_float_dtype, @@ -215,7 +216,7 @@ def _reconstruct_data( values = cls._from_sequence(values, dtype=dtype) # type: ignore[assignment] else: - values = values.astype(dtype, copy=False) + values = values.astype(dtype, copy=False) # type: ignore[assignment] return values @@ -511,6 +512,7 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]: len(values) > 0 and values.dtype.kind in "iufcb" and not is_signed_integer_dtype(comps) + and not is_dtype_equal(values, comps) ): # GH#46485 Use object to avoid upcast to float64 later # TODO: Share with _find_common_type_compat diff --git a/pandas/core/apply.py b/pandas/core/apply.py index da6124307e3f1..2c96f1ef020ac 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -327,7 +327,7 @@ def transform(self) -> DataFrame | Series: if is_series: func = {com.get_callable_name(v) or v: v for v in func} else: - func = {col: func for col in obj} + func = dict.fromkeys(obj, func) if is_dict_like(func): func = cast(AggFuncTypeDict, func) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 8a920d1849bb3..eb5026454552c 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -102,7 +102,7 @@ def quantile_with_mask( interpolation=interpolation, ) - result = np.asarray(result) + result = np.asarray(result) # type: ignore[assignment] result = result.T return result @@ -196,7 +196,7 @@ def _nanquantile( # Caller is responsible for ensuring mask shape match assert mask.shape == values.shape result = [ - _nanquantile_1d(val, m, qs, na_value, interpolation=interpolation) + _nanquantile_1d(val, m, qs, na_value, interpolation=interpolation) # type: ignore[arg-type] for (val, m) in zip(list(values), list(mask)) ] if values.dtype.kind == "f": diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 4e6f20e6ad3dd..26585e7bab8e3 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -142,18 +142,12 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike: dt64_values = arr.view(dtype) return DatetimeArray._simple_new(dt64_values, dtype=dtype) - elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype): from pandas.core.arrays import TimedeltaArray td64_values = arr.view(dtype) return TimedeltaArray._simple_new(td64_values, dtype=dtype) - - # error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible - # type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None, - # type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int, - # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" - return arr.view(dtype=dtype) # type: ignore[arg-type] + return arr.view(dtype=dtype) def take( self, diff --git a/pandas/core/arrays/arrow/_arrow_utils.py b/pandas/core/arrays/arrow/_arrow_utils.py index 285c3fd465ffc..7da83e2257e30 100644 --- a/pandas/core/arrays/arrow/_arrow_utils.py +++ b/pandas/core/arrays/arrow/_arrow_utils.py @@ -44,7 +44,7 @@ def pyarrow_array_to_numpy_and_mask( mask = pyarrow.BooleanArray.from_buffers( pyarrow.bool_(), 
len(arr), [None, bitmask], offset=arr.offset ) - mask = np.asarray(mask) + mask = np.asarray(mask) # type: ignore[assignment] else: mask = np.ones(len(arr), dtype=bool) return data, mask diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 9295cf7873d98..0b90bcea35100 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -33,7 +33,6 @@ infer_dtype_from_scalar, ) from pandas.core.dtypes.common import ( - CategoricalDtype, is_array_like, is_bool_dtype, is_float_dtype, @@ -730,9 +729,7 @@ def __setstate__(self, state) -> None: def _cmp_method(self, other, op) -> ArrowExtensionArray: pc_func = ARROW_CMP_FUNCS[op.__name__] - if isinstance( - other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) - ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): + if isinstance(other, (ExtensionArray, np.ndarray, list)): try: result = pc_func(self._pa_array, self._box_pa(other)) except pa.ArrowNotImplementedError: @@ -2540,7 +2537,7 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): dummies_dtype = np.bool_ dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype) dummies[indices] = True - dummies = dummies.reshape((n_rows, n_cols)) + dummies = dummies.reshape((n_rows, n_cols)) # type: ignore[assignment] result = type(self)(pa.array(list(dummies))) return result, uniques_sorted.to_pylist() diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index dad38abccf4ee..d0048e122051a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -596,7 +596,7 @@ def to_numpy( if copy or na_value is not lib.no_default: result = result.copy() if na_value is not lib.no_default: - result[self.isna()] = na_value + result[self.isna()] = na_value # type: ignore[index] return result # ------------------------------------------------------------------------ @@ -941,7 +941,7 @@ def argmin(self, skipna: bool = True) -> int: -------- >>> arr = pd.array([3, 1, 2, 5, 4]) >>> arr.argmin() - 1 + np.int64(1) """ # Implementer note: You have two places to override the behavior of # argmin. @@ -975,7 +975,7 @@ def argmax(self, skipna: bool = True) -> int: -------- >>> arr = pd.array([3, 1, 2, 5, 4]) >>> arr.argmax() - 3 + np.int64(3) """ # Implementer note: You have two places to override the behavior of # argmax. @@ -1959,10 +1959,10 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: -------- >>> class MyExtensionArray(pd.arrays.NumpyExtensionArray): ... def _formatter(self, boxed=False): - ... return lambda x: "*" + str(x) + "*" if boxed else repr(x) + "*" + ... 
return lambda x: "*" + str(x) + "*" >>> MyExtensionArray(np.array([1, 2, 3, 4])) - [1*, 2*, 3*, 4*] + [*1*, *2*, *3*, *4*] Length: 4, dtype: int64 """ if boxed: @@ -2176,15 +2176,15 @@ def _reduce( Examples -------- >>> pd.array([1, 2, 3])._reduce("min") - 1 + np.int64(1) >>> pd.array([1, 2, 3])._reduce("max") - 3 + np.int64(3) >>> pd.array([1, 2, 3])._reduce("sum") - 6 + np.int64(6) >>> pd.array([1, 2, 3])._reduce("mean") - 2.0 + np.float64(2.0) >>> pd.array([1, 2, 3])._reduce("median") - 2.0 + np.float64(2.0) """ meth = getattr(self, name, None) if meth is None: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 647530151d5f6..3d2ad109a55ba 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -452,7 +452,7 @@ def __init__( if isinstance(values, Index): arr = values._data._pa_array.combine_chunks() else: - arr = values._pa_array.combine_chunks() + arr = extract_array(values)._pa_array.combine_chunks() categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype) codes = arr.indices.to_numpy() dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered) @@ -1666,7 +1666,7 @@ def __array__( Parameters ---------- dtype : np.dtype or None - Specifies the the dtype for the array. + Specifies the dtype for the array. copy : bool or None, optional See :func:`numpy.asarray`. @@ -1853,7 +1853,7 @@ def value_counts(self, dropna: bool = True) -> Series: count = np.bincount(obs, minlength=ncat or 0) else: count = np.bincount(np.where(mask, code, ncat)) - ix = np.append(ix, -1) + ix = np.append(ix, -1) # type: ignore[assignment] ix = coerce_indexer_dtype(ix, self.dtype.categories) ix_categorical = self._from_backing_data(ix) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index eba738c926497..994d7b1d0081c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -275,7 +275,7 @@ def _unbox_scalar( -------- >>> arr = pd.array(np.array(["1970-01-01"], "datetime64[ns]")) >>> arr._unbox_scalar(arr[0]) - numpy.datetime64('1970-01-01T00:00:00.000000000') + np.datetime64('1970-01-01T00:00:00.000000000') """ raise AbstractMethodError(self) @@ -2394,7 +2394,7 @@ def take( ) indices = np.asarray(indices, dtype=np.intp) - maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) + maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) # type: ignore[arg-type] if isinstance(maybe_slice, slice): freq = self._get_getitem_freq(maybe_slice) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index df40c9c11b117..b31c543188282 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -331,7 +331,7 @@ def _simple_new( # type: ignore[override] else: # DatetimeTZDtype. If we have e.g. DatetimeTZDtype[us, UTC], # then values.dtype should be M8[us]. 
- assert dtype._creso == get_unit_from_dtype(values.dtype) + assert dtype._creso == get_unit_from_dtype(values.dtype) # type: ignore[union-attr] result = super()._simple_new(values, dtype) result._freq = freq @@ -542,7 +542,7 @@ def _unbox_scalar(self, value) -> np.datetime64: raise ValueError("'value' should be a Timestamp.") self._check_compatible_with(value) if value is NaT: - return np.datetime64(value._value, self.unit) + return np.datetime64(value._value, self.unit) # type: ignore[call-overload] else: return value.as_unit(self.unit, round_ok=False).asm8 @@ -813,10 +813,7 @@ def _add_offset(self, offset: BaseOffset) -> Self: try: res_values = offset._apply_array(values._ndarray) if res_values.dtype.kind == "i": - # error: Argument 1 to "view" of "ndarray" has incompatible type - # "dtype[datetime64] | DatetimeTZDtype"; expected - # "dtype[Any] | type[Any] | _SupportsDType[dtype[Any]]" - res_values = res_values.view(values.dtype) # type: ignore[arg-type] + res_values = res_values.view(values.dtype) except NotImplementedError: if get_option("performance_warnings"): warnings.warn( diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 0bf2089df5f85..6cb79e915c78b 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1775,7 +1775,8 @@ def to_tuples(self, na_tuple: bool = True) -> np.ndarray: [(0, 1], (1, 2]] Length: 2, dtype: interval[int64, right] >>> idx.to_tuples() - array([(0, 1), (1, 2)], dtype=object) + array([(np.int64(0), np.int64(1)), (np.int64(1), np.int64(2))], + dtype=object) For :class:`pandas.IntervalIndex`: diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 708a3818bcbb7..e7a6b207363c3 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1378,25 +1378,25 @@ def any( skips NAs): >>> pd.array([True, False, True]).any() - True + np.True_ >>> pd.array([True, False, pd.NA]).any() - True + np.True_ >>> pd.array([False, False, pd.NA]).any() - False + np.False_ >>> pd.array([], dtype="boolean").any() - False + np.False_ >>> pd.array([pd.NA], dtype="boolean").any() - False + np.False_ >>> pd.array([pd.NA], dtype="Float64").any() - False + np.False_ With ``skipna=False``, the result can be NA if this is logically required (whether ``pd.NA`` is True or False influences the result): >>> pd.array([True, False, pd.NA]).any(skipna=False) - True + np.True_ >>> pd.array([1, 0, pd.NA]).any(skipna=False) - True + np.True_ >>> pd.array([False, False, pd.NA]).any(skipna=False) >>> pd.array([0, 0, pd.NA]).any(skipna=False) @@ -1466,17 +1466,17 @@ def all( skips NAs): >>> pd.array([True, True, pd.NA]).all() - True + np.True_ >>> pd.array([1, 1, pd.NA]).all() - True + np.True_ >>> pd.array([True, False, pd.NA]).all() - False + np.False_ >>> pd.array([], dtype="boolean").all() - True + np.True_ >>> pd.array([pd.NA], dtype="boolean").all() - True + np.True_ >>> pd.array([pd.NA], dtype="Float64").all() - True + np.True_ With ``skipna=False``, the result can be NA if this is logically required (whether ``pd.NA`` is True or False influences the result): @@ -1486,9 +1486,9 @@ def all( >>> pd.array([1, 1, pd.NA]).all(skipna=False) >>> pd.array([True, False, pd.NA]).all(skipna=False) - False + np.False_ >>> pd.array([1, 0, pd.NA]).all(skipna=False) - False + np.False_ """ nv.validate_all((), kwargs) @@ -1497,10 +1497,10 @@ def all( result = values.all(axis=axis) if skipna: - return result + return result # type: ignore[return-value] else: if not result or len(self) == 0 or not 
self._mask.any(): - return result + return result # type: ignore[return-value] else: return self.dtype.na_value diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index eab8527eef526..7dde03b30cd6a 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -297,7 +297,7 @@ class SparseFrameAccessor(BaseAccessor, PandasDelegate): -------- >>> df = pd.DataFrame({"a": [1, 2, 0, 0], "b": [3, 0, 0, 4]}, dtype="Sparse[int]") >>> df.sparse.density - 0.5 + np.float64(0.5) """ def _validate(self, data) -> None: @@ -459,7 +459,7 @@ def density(self) -> float: -------- >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])}) >>> df.sparse.density - 0.5 + np.float64(0.5) """ tmp = np.mean([column.array.density for _, column in self._parent.items()]) return tmp diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index cc9fd2d5fb8b0..d4ef3003583c3 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -79,7 +79,7 @@ def _levels_to_axis( ax_coords = codes[valid_ilocs] ax_labels = ax_labels.tolist() - return ax_coords, ax_labels + return ax_coords, ax_labels # pyright: ignore[reportReturnType] def _to_ijv( diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7227ea77ca433..8048306df91a2 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -123,10 +123,10 @@ class StringDtype(StorageExtensionDtype): Examples -------- >>> pd.StringDtype() - string[python] + )> >>> pd.StringDtype(storage="pyarrow") - string[pyarrow] + )> """ @property @@ -198,11 +198,8 @@ def __init__( self._na_value = na_value def __repr__(self) -> str: - if self._na_value is libmissing.NA: - return f"{self.name}[{self.storage}]" - else: - # TODO add more informative repr - return self.name + storage = "" if self.storage == "pyarrow" else "storage='python', " + return f"" def __eq__(self, other: object) -> bool: # we need to override the base class __eq__ because na_value (NA or NaN) @@ -1018,7 +1015,30 @@ def searchsorted( return super().searchsorted(value=value, side=side, sorter=sorter) def _cmp_method(self, other, op): - from pandas.arrays import BooleanArray + from pandas.arrays import ( + ArrowExtensionArray, + BooleanArray, + ) + + if ( + isinstance(other, BaseStringArray) + and self.dtype.na_value is not libmissing.NA + and other.dtype.na_value is libmissing.NA + ): + # NA has priority of NaN semantics + return NotImplemented + + if isinstance(other, ArrowExtensionArray): + if isinstance(other, BaseStringArray): + # pyarrow storage has priority over python storage + # (except if we have NA semantics and other not) + if not ( + self.dtype.na_value is libmissing.NA + and other.dtype.na_value is not libmissing.NA + ): + return NotImplemented + else: + return NotImplemented if isinstance(other, StringArray): other = other._ndarray diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index d35083fd892a8..9668981df827b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -281,7 +281,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: ] # short-circuit to return all False array. 
- if not len(value_set): + if not value_set: return np.zeros(len(self), dtype=bool) result = pc.is_in( @@ -473,6 +473,14 @@ def value_counts(self, dropna: bool = True) -> Series: return result def _cmp_method(self, other, op): + if ( + isinstance(other, (BaseStringArray, ArrowExtensionArray)) + and self.dtype.na_value is not libmissing.NA + and other.dtype.na_value is libmissing.NA + ): + # NA has priority of NaN semantics + return NotImplemented + result = super()._cmp_method(other, op) if self.dtype.na_value is np.nan: if op == operator.ne: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index c5b3129c506c8..9012b9f36348a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -325,7 +325,7 @@ def _unbox_scalar(self, value) -> np.timedelta64: raise ValueError("'value' should be a Timedelta.") self._check_compatible_with(value) if value is NaT: - return np.timedelta64(value._value, self.unit) + return np.timedelta64(value._value, self.unit) # type: ignore[call-overload] else: return value.as_unit(self.unit, round_ok=False).asm8 diff --git a/pandas/core/base.py b/pandas/core/base.py index a64cd8633c1db..6cc28d4e46634 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -804,9 +804,9 @@ def argmax( dtype: float64 >>> s.argmax() - 2 + np.int64(2) >>> s.argmin() - 0 + np.int64(0) The maximum cereal calories is the third element and the minimum cereal calories is the first element, @@ -1360,7 +1360,7 @@ def factorize( dtype: int64 >>> ser.searchsorted(4) - 3 + np.int64(3) >>> ser.searchsorted([0, 4]) array([0, 3]) @@ -1379,7 +1379,7 @@ def factorize( dtype: datetime64[s] >>> ser.searchsorted('3/14/2000') - 3 + np.int64(3) >>> ser = pd.Categorical( ... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True @@ -1389,7 +1389,7 @@ def factorize( Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk'] >>> ser.searchsorted('bread') - 1 + np.int64(1) >>> ser.searchsorted(['bread'], side='right') array([3]) @@ -1480,9 +1480,9 @@ def _arith_method(self, other, op): with np.errstate(all="ignore"): result = ops.arithmetic_op(lvalues, rvalues, op) - return self._construct_result(result, name=res_name) + return self._construct_result(result, name=res_name, other=other) - def _construct_result(self, result, name): + def _construct_result(self, result, name, other): """ Construct an appropriately-wrapped result from the ArrayLike result of an arithmetic-like operation. 
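Editorial note: the `_cmp_method` changes in `string_.py` and `string_arrow.py` above rely on Python's binary-operator protocol: when the left operand's dunder method returns `NotImplemented`, the interpreter retries the reflected method on the right operand, which is how one array type can yield the comparison to the other (e.g. NA semantics taking priority over NaN semantics). A self-contained sketch of that mechanism, using hypothetical classes rather than pandas types:

class Celsius:
    def __init__(self, value):
        self.value = value

    def __eq__(self, other):
        if isinstance(other, Celsius):
            return self.value == other.value
        # Yield to the other operand; Python will try its __eq__ next.
        return NotImplemented


class Fahrenheit:
    def __init__(self, value):
        self.value = value

    def __eq__(self, other):
        if isinstance(other, Fahrenheit):
            return self.value == other.value
        if isinstance(other, Celsius):
            return self.value == other.value * 9 / 5 + 32
        return NotImplemented


# Celsius.__eq__ returns NotImplemented, so the reflected call
# Fahrenheit.__eq__(Fahrenheit(212), Celsius(100)) resolves the comparison.
assert Celsius(100) == Fahrenheit(212)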
diff --git a/pandas/core/common.py b/pandas/core/common.py index 100ad312bd839..75f8a56aac5db 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -246,7 +246,8 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi with warnings.catch_warnings(): # Can remove warning filter once NumPy 1.24 is min version if not np_version_gte1p24: - warnings.simplefilter("ignore", np.VisibleDeprecationWarning) + # np.VisibleDeprecationWarning only in np.exceptions in 2.0 + warnings.simplefilter("ignore", np.VisibleDeprecationWarning) # type: ignore[attr-defined] result = np.asarray(values, dtype=dtype) except ValueError: # Using try/except since it's more performant than checking is_list_like diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 14a393b02409c..b53596fe28e70 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -644,7 +644,11 @@ def visit_Attribute(self, node, **kwargs): ctx = node.ctx if isinstance(ctx, ast.Load): # resolve the value - resolved = self.visit(value).value + visited_value = self.visit(value) + if hasattr(visited_value, "value"): + resolved = visited_value.value + else: + resolved = visited_value(self.env) try: v = getattr(resolved, attr) name = self.env.add_tmp(v) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 166c9d47294cd..77b7d9ad11a6c 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -239,7 +239,8 @@ def stringify(value): if conv_val not in metadata: result = -1 else: - result = metadata.searchsorted(conv_val, side="left") + # Find the index of the first match of conv_val in metadata + result = np.flatnonzero(metadata == conv_val)[0] return TermValue(result, result, "integer") elif kind == "integer": try: diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index d8a42d83b6c54..428fc24cd08ac 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -45,6 +45,11 @@ class ExtensionDtype: """ A custom data type, to be paired with an ExtensionArray. + This enables support for third-party and custom dtypes within the + pandas ecosystem. By implementing this interface and pairing it with a custom + `ExtensionArray`, users can create rich data types that integrate cleanly + with pandas operations, such as grouping, joining, or aggregation. + See Also -------- extensions.register_extension_dtype: Register an ExtensionType diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index e92f2363b69f1..68d99937f728c 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -655,24 +655,38 @@ def is_dtype_equal(source, target) -> bool: Parameters ---------- - source : The first dtype to compare - target : The second dtype to compare + source : type or str + The first dtype to compare. + target : type or str + The second dtype to compare. Returns ------- boolean Whether or not the two dtypes are equal. + See Also + -------- + api.types.is_categorical_dtype : Check whether the provided array or dtype + is of the Categorical dtype. + api.types.is_string_dtype : Check whether the provided array or dtype + is of the string dtype. + api.types.is_object_dtype : Check whether an array-like or dtype is of the + object dtype. 
+ Examples -------- + >>> from pandas.api.types import is_dtype_equal >>> is_dtype_equal(int, float) False >>> is_dtype_equal("int", int) True >>> is_dtype_equal(object, "category") False + >>> from pandas.core.dtypes.dtypes import CategoricalDtype >>> is_dtype_equal(CategoricalDtype(), "category") True + >>> from pandas.core.dtypes.dtypes import DatetimeTZDtype >>> is_dtype_equal(DatetimeTZDtype(tz="UTC"), "datetime64") False """ diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index f20ca44728664..71fe0f6e4feb0 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -428,9 +428,9 @@ def array_equivalent( Examples -------- >>> array_equivalent(np.array([1, 2, np.nan]), np.array([1, 2, np.nan])) - True + np.True_ >>> array_equivalent(np.array([1, np.nan, 2]), np.array([1, 2, np.nan])) - False + np.False_ """ left, right = np.asarray(left), np.asarray(right) @@ -626,7 +626,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): >>> na_value_for_dtype(np.dtype("bool")) False >>> na_value_for_dtype(np.dtype("datetime64[ns]")) - numpy.datetime64('NaT') + np.datetime64('NaT') """ if isinstance(dtype, ExtensionDtype): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8f65277f660f7..ea7c1afdd036b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8,13 +8,11 @@ alignment and a host of useful data manipulation methods having to do with the labeling information """ - from __future__ import annotations import collections from collections import abc from collections.abc import ( - Callable, Hashable, Iterable, Iterator, @@ -22,6 +20,7 @@ Sequence, ) import functools +from inspect import signature from io import StringIO import itertools import operator @@ -30,6 +29,7 @@ from typing import ( TYPE_CHECKING, Any, + Callable, Literal, cast, overload, @@ -39,7 +39,12 @@ import numpy as np from numpy import ma -from pandas._config import get_option +from pandas._config import ( + get_option, + using_copy_on_write, + warn_copy_on_write, +) +from pandas._config.config import _get_option from pandas._libs import ( algos as libalgos, @@ -55,17 +60,16 @@ from pandas.errors import ( ChainedAssignmentError, InvalidIndexError, -) -from pandas.errors.cow import ( _chained_assignment_method_msg, _chained_assignment_msg, + _chained_assignment_warning_method_msg, + _chained_assignment_warning_msg, ) from pandas.util._decorators import ( Appender, Substitution, deprecate_nonkeyword_arguments, doc, - set_module, ) from pandas.util._exceptions import ( find_stack_level, @@ -85,6 +89,7 @@ find_common_type, infer_dtype_from_scalar, invalidate_string_dtypes, + maybe_box_native, maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( @@ -124,7 +129,7 @@ ops, roperator, ) -from pandas.core.accessor import Accessor +from pandas.core.accessor import CachedAccessor from pandas.core.apply import reconstruct_and_relabel_result from pandas.core.array_algos.take import take_2d_multi from pandas.core.arraylike import OpsMixin @@ -162,11 +167,15 @@ check_bool_indexer, check_dict_or_set_indexers, ) -from pandas.core.internals import BlockManager +from pandas.core.internals import ( + ArrayManager, + BlockManager, +) from pandas.core.internals.construction import ( arrays_to_mgr, dataclasses_to_dicts, dict_to_mgr, + mgr_to_mgr, ndarray_to_mgr, nested_data_to_arrays, rec_array_to_mgr, @@ -219,17 +228,15 @@ FormattersType, Frequency, FromDictOrient, - HashableT, - HashableT2, IgnoreRaise, IndexKeyFunc, IndexLabel, JoinValidate, Level, - 
ListLike, MergeHow, MergeValidate, MutableMappingT, + NaAction, NaPosition, NsmallestNlargestKeep, PythonFuncType, @@ -243,7 +250,7 @@ SortKind, StorageOptions, Suffixes, - T, + ToGbqIfexist, ToStataByteorder, ToTimestampHow, UpdateJoin, @@ -255,7 +262,7 @@ from pandas.core.groupby.generic import DataFrameGroupBy from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg - from pandas.core.internals.managers import SingleBlockManager + from pandas.core.internals import SingleDataManager from pandas.io.formats.style import Styler @@ -315,8 +322,7 @@ ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'}, - default 'inner' +how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -329,10 +335,6 @@ join; preserve the order of the left keys. * cross: creates the cartesian product from both frames, preserves the order of the left keys. - * left_anti: use only keys from left frame that are not in right frame, similar - to SQL left anti join; preserve key order. - * right_anti: use only keys from right frame that are not in left frame, similar - to SQL right anti join; preserve key order. on : label or list Column or index level names to join on. These must be found in both DataFrames. If `on` is None and not merging on indexes then this defaults @@ -362,7 +364,7 @@ of a string to indicate that the column name from `left` or `right` should be left as-is, with no suffix. At least one of the values must not be None. -copy : bool, default False +copy : bool, default True If False, avoid copy if possible. .. note:: @@ -376,8 +378,6 @@ You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` - - .. deprecated:: 3.0.0 indicator : bool or str, default False If True, adds a column to the output DataFrame called "_merge" with information on the source of each row. The column can be given a different @@ -506,7 +506,6 @@ # DataFrame class -@set_module("pandas") class DataFrame(NDFrame, OpsMixin): """ Two-dimensional, size-mutable, potentially heterogeneous tabular data. @@ -536,7 +535,6 @@ class DataFrame(NDFrame, OpsMixin): will perform column selection instead. dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. - If ``data`` is DataFrame then is ignored. copy : bool or None, default None Copy data from inputs. For dict data, the default of None behaves like ``copy=True``. For DataFrame @@ -562,7 +560,7 @@ class DataFrame(NDFrame, OpsMixin): -------- Constructing DataFrame from a dictionary. - >>> d = {"col1": [1, 2], "col2": [3, 4]} + >>> d = {'col1': [1, 2], 'col2': [3, 4]} >>> df = pd.DataFrame(data=d) >>> df col1 col2 @@ -586,7 +584,7 @@ class DataFrame(NDFrame, OpsMixin): Constructing DataFrame from a dictionary including Series: - >>> d = {"col1": [0, 1, 2, 3], "col2": pd.Series([2, 3], index=[2, 3])} + >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])} >>> pd.DataFrame(data=d, index=[0, 1, 2, 3]) col1 col2 0 0 NaN @@ -596,9 +594,8 @@ class DataFrame(NDFrame, OpsMixin): Constructing DataFrame from numpy ndarray: - >>> df2 = pd.DataFrame( - ... np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=["a", "b", "c"] - ... ) + >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), + ... 
columns=['a', 'b', 'c']) >>> df2 a b c 0 1 2 3 @@ -607,11 +604,10 @@ class DataFrame(NDFrame, OpsMixin): Constructing DataFrame from a numpy ndarray that has labeled columns: - >>> data = np.array( - ... [(1, 2, 3), (4, 5, 6), (7, 8, 9)], - ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")], - ... ) - >>> df3 = pd.DataFrame(data, columns=["c", "a"]) + >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)], + ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")]) + >>> df3 = pd.DataFrame(data, columns=['c', 'a']) + ... >>> df3 c a 0 3 1 @@ -650,14 +646,14 @@ class DataFrame(NDFrame, OpsMixin): _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) _accessors: set[str] = {"sparse"} _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([]) - _mgr: BlockManager + _mgr: BlockManager | ArrayManager # similar to __array_priority__, positions DataFrame before Series, Index, # and ExtensionArray. Should NOT be overridden by subclasses. __pandas_priority__ = 4000 @property - def _constructor(self) -> type[DataFrame]: + def _constructor(self) -> Callable[..., DataFrame]: return DataFrame def _constructor_from_mgr(self, mgr, axes) -> DataFrame: @@ -715,7 +711,7 @@ def __init__( # to avoid the result sharing the same Manager data = data.copy(deep=False) - if isinstance(data, BlockManager): + if isinstance(data, (BlockManager, ArrayManager)): if not allow_mgr: # GH#52419 warnings.warn( @@ -723,10 +719,11 @@ def __init__( "is deprecated and will raise in a future version. " "Use public APIs instead.", DeprecationWarning, - stacklevel=2, + stacklevel=1, # bump to 2 once pyarrow 15.0 is released with fix ) - data = data.copy(deep=False) + if using_copy_on_write(): + data = data.copy(deep=False) # first check if a Manager is passed without any other arguments # -> use fastpath (without checking Manager type) if index is None and columns is None and dtype is None and not copy: @@ -734,6 +731,12 @@ def __init__( NDFrame.__init__(self, data) return + manager = _get_option("mode.data_manager", silent=True) + + is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) + data_dtype = getattr(data, "dtype", None) + original_dtype = dtype + # GH47215 if isinstance(index, set): raise ValueError("index cannot be a set") @@ -744,7 +747,17 @@ def __init__( if isinstance(data, dict): # retain pre-GH#38939 default behavior copy = True - elif not isinstance(data, (Index, DataFrame, Series)): + elif ( + manager == "array" + and isinstance(data, (np.ndarray, ExtensionArray)) + and data.ndim == 2 + ): + # INFO(ArrayManager) by default copy the 2D input array to get + # contiguous 1D arrays + copy = True + elif using_copy_on_write() and not isinstance( + data, (Index, DataFrame, Series) + ): copy = True else: copy = False @@ -755,14 +768,14 @@ def __init__( dtype = dtype if dtype is not None else pandas_dtype(object) data = [] - if isinstance(data, BlockManager): + if isinstance(data, (BlockManager, ArrayManager)): mgr = self._init_mgr( data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy ) elif isinstance(data, dict): # GH#38939 de facto copy defaults to False only in non-dict cases - mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy) + mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager) elif isinstance(data, ma.MaskedArray): from numpy.ma import mrecords @@ -782,6 +795,7 @@ def __init__( columns, dtype=dtype, copy=copy, + typ=manager, ) elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)): @@ -794,9 +808,11 @@ def __init__( 
columns, dtype, copy, + typ=manager, ) elif getattr(data, "name", None) is not None: # i.e. Series/Index with non-None name + _copy = copy if using_copy_on_write() else True mgr = dict_to_mgr( # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no # attribute "name" @@ -804,7 +820,8 @@ def __init__( index, columns, dtype=dtype, - copy=copy, + typ=manager, + copy=_copy, ) else: mgr = ndarray_to_mgr( @@ -813,6 +830,7 @@ def __init__( columns, dtype=dtype, copy=copy, + typ=manager, ) # For data is list-like, or Iterable (will consume into list) @@ -843,6 +861,7 @@ def __init__( columns, index, dtype=dtype, + typ=manager, ) else: mgr = ndarray_to_mgr( @@ -851,6 +870,7 @@ def __init__( columns, dtype=dtype, copy=copy, + typ=manager, ) else: mgr = dict_to_mgr( @@ -858,6 +878,7 @@ def __init__( index, columns if columns is not None else default_index(0), dtype=dtype, + typ=manager, ) # For data is scalar else: @@ -878,7 +899,7 @@ def __init__( construct_1d_arraylike_from_scalar(data, len(index), dtype) for _ in range(len(columns)) ] - mgr = arrays_to_mgr(values, columns, index, dtype=None) + mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager) else: arr2d = construct_2d_arraylike_from_scalar( data, @@ -894,10 +915,26 @@ def __init__( columns, dtype=arr2d.dtype, copy=False, + typ=manager, ) + # ensure correct Manager type according to settings + mgr = mgr_to_mgr(mgr, typ=manager) + NDFrame.__init__(self, mgr) + if original_dtype is None and is_pandas_object and data_dtype == np.object_: + if self.dtypes.iloc[0] != data_dtype: + warnings.warn( + "Dtype inference on a pandas object " + "(Series, Index, ExtensionArray) is deprecated. The DataFrame " + "constructor will keep the original dtype in the future. " + "Call `infer_objects` on the result to get the old " + "behavior.", + FutureWarning, + stacklevel=2, + ) + # ---------------------------------------------------------------------- def __dataframe__( @@ -906,19 +943,6 @@ def __dataframe__( """ Return the dataframe interchange object implementing the interchange protocol. - .. note:: - - For new development, we highly recommend using the Arrow C Data Interface - alongside the Arrow PyCapsule Interface instead of the interchange protocol - - .. warning:: - - Due to severe implementation issues, we recommend only considering using the - interchange protocol in the following cases: - - - converting to pandas: for pandas >= 2.0.3 - - converting from pandas: for pandas >= 3.0.0 - Parameters ---------- nan_as_null : bool, default False @@ -933,11 +957,6 @@ def __dataframe__( DataFrame interchange object The object which consuming library can use to ingress the dataframe. - See Also - -------- - DataFrame.from_records : Constructor from tuples, also record arrays. - DataFrame.from_dict : From dicts of Series, arrays, or dicts. - Notes ----- Details on the interchange protocol: @@ -945,13 +964,12 @@ def __dataframe__( Examples -------- - >>> df_not_necessarily_pandas = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) + >>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) >>> interchange_object = df_not_necessarily_pandas.__dataframe__() >>> interchange_object.column_names() Index(['A', 'B'], dtype='object') - >>> df_pandas = pd.api.interchange.from_dataframe( - ... interchange_object.select_columns_by_name(["A"]) - ... ) + >>> df_pandas = (pd.api.interchange.from_dataframe + ... 
(interchange_object.select_columns_by_name(['A']))) >>> df_pandas A 0 1 @@ -965,6 +983,21 @@ def __dataframe__( return PandasDataFrameXchg(self, allow_copy=allow_copy) + def __dataframe_consortium_standard__( + self, *, api_version: str | None = None + ) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of pandas. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + dataframe_api_compat = import_optional_dependency("dataframe_api_compat") + convert_to_standard_compliant_dataframe = ( + dataframe_api_compat.pandas_standard.convert_to_standard_compliant_dataframe + ) + return convert_to_standard_compliant_dataframe(self, api_version=api_version) + def __arrow_c_stream__(self, requested_schema=None): """ Export the pandas DataFrame as an Arrow C stream PyCapsule. @@ -1002,14 +1035,9 @@ def axes(self) -> list[Index]: It has the row axis labels and column axis labels as the only members. They are returned in that order. - See Also - -------- - DataFrame.index: The index (row labels) of the DataFrame. - DataFrame.columns: The column labels of the DataFrame. - Examples -------- - >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.axes [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'], dtype='object')] @@ -1021,21 +1049,18 @@ def shape(self) -> tuple[int, int]: """ Return a tuple representing the dimensionality of the DataFrame. - Unlike the `len()` method, which only returns the number of rows, `shape` - provides both row and column counts, making it a more informative method for - understanding dataset size. - See Also -------- - numpy.ndarray.shape : Tuple of array dimensions. + ndarray.shape : Tuple of array dimensions. Examples -------- - >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.shape (2, 2) - >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4], "col3": [5, 6]}) + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4], + ... 'col3': [5, 6]}) >>> df.shape (2, 3) """ @@ -1060,22 +1085,21 @@ def _is_homogeneous_type(self) -> bool: Items with the same type but different sizes are considered different types. - >>> DataFrame( - ... { - ... "A": np.array([1, 2], dtype=np.int32), - ... "B": np.array([1, 2], dtype=np.int64), - ... } - ... )._is_homogeneous_type + >>> DataFrame({ + ... "A": np.array([1, 2], dtype=np.int32), + ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type False """ # The "<" part of "<=" here is for empty DataFrame cases - return len({block.values.dtype for block in self._mgr.blocks}) <= 1 + return len({arr.dtype for arr in self._mgr.arrays}) <= 1 @property def _can_fast_transpose(self) -> bool: """ Can we transpose this DataFrame without creating any new array objects. 
""" + if isinstance(self._mgr, ArrayManager): + return False blocks = self._mgr.blocks if len(blocks) != 1: return False @@ -1091,6 +1115,13 @@ def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray: """ mgr = self._mgr + if isinstance(mgr, ArrayManager): + if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype): + # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]" + # has no attribute "reshape" + return mgr.arrays[0].reshape(-1, 1) # type: ignore[union-attr] + return ensure_wrapped_if_datetimelike(self.values) + blocks = mgr.blocks if len(blocks) != 1: return ensure_wrapped_if_datetimelike(self.values) @@ -1201,7 +1232,6 @@ def _repr_html_(self) -> str | None: min_rows = get_option("display.min_rows") max_cols = get_option("display.max_columns") show_dimensions = get_option("display.show_dimensions") - show_floats = get_option("display.float_format") formatter = fmt.DataFrameFormatter( self, @@ -1209,7 +1239,7 @@ def _repr_html_(self) -> str | None: col_space=None, na_rep="NaN", formatters=None, - float_format=show_floats, + float_format=None, sparsify=None, justify=None, index_names=True, @@ -1231,7 +1261,6 @@ def _repr_html_(self) -> str | None: def to_string( self, buf: None = ..., - *, columns: Axes | None = ..., col_space: int | list[int] | dict[Hashable, int] | None = ..., header: bool | SequenceNotStr[str] = ..., @@ -1250,13 +1279,13 @@ def to_string( min_rows: int | None = ..., max_colwidth: int | None = ..., encoding: str | None = ..., - ) -> str: ... + ) -> str: + ... @overload def to_string( self, buf: FilePath | WriteBuffer[str], - *, columns: Axes | None = ..., col_space: int | list[int] | dict[Hashable, int] | None = ..., header: bool | SequenceNotStr[str] = ..., @@ -1275,8 +1304,12 @@ def to_string( min_rows: int | None = ..., max_colwidth: int | None = ..., encoding: str | None = ..., - ) -> None: ... + ) -> None: + ... + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "buf"], name="to_string" + ) @Substitution( header_type="bool or list of str", header="Write out the column names. 
If a list of columns " @@ -1291,7 +1324,6 @@ def to_string( def to_string( self, buf: FilePath | WriteBuffer[str] | None = None, - *, columns: Axes | None = None, col_space: int | list[int] | dict[Hashable, int] | None = None, header: bool | SequenceNotStr[str] = True, @@ -1330,7 +1362,7 @@ def to_string( Examples -------- - >>> d = {"col1": [1, 2, 3], "col2": [4, 5, 6]} + >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]} >>> df = pd.DataFrame(d) >>> print(df.to_string()) col1 col2 @@ -1373,7 +1405,7 @@ def _get_values_for_csv( decimal: str, na_rep: str, quoting, # int csv.QUOTE_FOO from stdlib - ) -> DataFrame: + ) -> Self: # helper used by to_csv mgr = self._mgr.get_values_for_csv( float_format=float_format, @@ -1382,7 +1414,8 @@ def _get_values_for_csv( na_rep=na_rep, quoting=quoting, ) - return self._constructor_from_mgr(mgr, axes=mgr.axes) + # error: Incompatible return value type (got "DataFrame", expected "Self") + return self._constructor_from_mgr(mgr, axes=mgr.axes) # type: ignore[return-value] # ---------------------------------------------------------------------- @@ -1400,7 +1433,7 @@ def style(self) -> Styler: Examples -------- - >>> df = pd.DataFrame({"A": [1, 2, 3]}) + >>> df = pd.DataFrame({'A': [1, 2, 3]}) >>> df.style # doctest: +SKIP Please see @@ -1415,7 +1448,9 @@ def style(self) -> Styler: return Styler(self) - _shared_docs["items"] = r""" + _shared_docs[ + "items" + ] = r""" Iterate over (column name, Series) pairs. Iterates over the DataFrame columns, returning a tuple with @@ -1465,8 +1500,12 @@ def style(self) -> Styler: @Appender(_shared_docs["items"]) def items(self) -> Iterable[tuple[Hashable, Series]]: - for i, k in enumerate(self.columns): - yield k, self._ixs(i, axis=1) + if self.columns.is_unique and hasattr(self, "_item_cache"): + for k in self.columns: + yield k, self._get_item_cache(k) + else: + for i, k in enumerate(self.columns): + yield k, self._ixs(i, axis=1) def iterrows(self) -> Iterable[tuple[Hashable, Series]]: """ @@ -1502,23 +1541,24 @@ def iterrows(self) -> Iterable[tuple[Hashable, Series]]: Examples -------- - >>> df = pd.DataFrame([[1, 1.5]], columns=["int", "float"]) + >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float']) >>> row = next(df.iterrows())[1] >>> row int 1.0 float 1.5 Name: 0, dtype: float64 - >>> print(row["int"].dtype) + >>> print(row['int'].dtype) float64 - >>> print(df["int"].dtype) + >>> print(df['int'].dtype) int64 """ columns = self.columns klass = self._constructor_sliced + using_cow = using_copy_on_write() for k, v in zip(self.index, self.values): s = klass(v, index=columns, name=k).__finalize__(self) - if self._mgr.is_single_block: - s._mgr.add_references(self._mgr) + if using_cow and self._mgr.is_single_block: + s._mgr.add_references(self._mgr) # type: ignore[arg-type] yield k, s def itertuples( @@ -1555,15 +1595,15 @@ def itertuples( Examples -------- - >>> df = pd.DataFrame( - ... {"num_legs": [4, 2], "num_wings": [0, 2]}, index=["dog", "hawk"] - ... ) + >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]}, + ... index=['dog', 'hawk']) >>> df num_legs num_wings dog 4 0 hawk 2 2 >>> for row in df.itertuples(): ... print(row) + ... Pandas(Index='dog', num_legs=4, num_wings=0) Pandas(Index='hawk', num_legs=2, num_wings=2) @@ -1572,14 +1612,16 @@ def itertuples( >>> for row in df.itertuples(index=False): ... print(row) + ... 
Pandas(num_legs=4, num_wings=0) Pandas(num_legs=2, num_wings=2) With the `name` parameter set we set a custom name for the yielded namedtuples: - >>> for row in df.itertuples(name="Animal"): + >>> for row in df.itertuples(name='Animal'): ... print(row) + ... Animal(Index='dog', num_legs=4, num_wings=0) Animal(Index='hawk', num_legs=2, num_wings=2) """ @@ -1610,10 +1652,12 @@ def __len__(self) -> int: return len(self.index) @overload - def dot(self, other: Series) -> Series: ... + def dot(self, other: Series) -> Series: + ... @overload - def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame: ... + def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame: + ... def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: """ @@ -1697,8 +1741,8 @@ def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: if len(common) > len(self.columns) or len(common) > len(other.index): raise ValueError("matrices are not aligned") - left = self.reindex(columns=common) - right = other.reindex(index=common) + left = self.reindex(columns=common, copy=False) + right = other.reindex(index=common, copy=False) lvals = left.values rvals = right._values else: @@ -1734,10 +1778,12 @@ def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: raise TypeError(f"unsupported type: {type(other)}") @overload - def __matmul__(self, other: Series) -> Series: ... + def __matmul__(self, other: Series) -> Series: + ... @overload - def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: ... + def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: + ... def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: """ @@ -1810,7 +1856,7 @@ def from_dict( -------- By default the keys of the dict become the DataFrame columns: - >>> data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]} + >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']} >>> pd.DataFrame.from_dict(data) col_1 col_2 0 3 a @@ -1821,8 +1867,8 @@ def from_dict( Specify ``orient='index'`` to create the DataFrame using dictionary keys as rows: - >>> data = {"row_1": [3, 2, 1, 0], "row_2": ["a", "b", "c", "d"]} - >>> pd.DataFrame.from_dict(data, orient="index") + >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']} + >>> pd.DataFrame.from_dict(data, orient='index') 0 1 2 3 row_1 3 2 1 0 row_2 a b c d @@ -1830,7 +1876,8 @@ def from_dict( When using the 'index' orientation, the column names can be specified manually: - >>> pd.DataFrame.from_dict(data, orient="index", columns=["A", "B", "C", "D"]) + >>> pd.DataFrame.from_dict(data, orient='index', + ... columns=['A', 'B', 'C', 'D']) A B C D row_1 3 2 1 0 row_2 a b c d @@ -1838,21 +1885,19 @@ def from_dict( Specify ``orient='tight'`` to create the DataFrame using a 'tight' format: - >>> data = { - ... "index": [("a", "b"), ("a", "c")], - ... "columns": [("x", 1), ("y", 2)], - ... "data": [[1, 3], [2, 4]], - ... "index_names": ["n1", "n2"], - ... "column_names": ["z1", "z2"], - ... } - >>> pd.DataFrame.from_dict(data, orient="tight") + >>> data = {'index': [('a', 'b'), ('a', 'c')], + ... 'columns': [('x', 1), ('y', 2)], + ... 'data': [[1, 3], [2, 4]], + ... 'index_names': ['n1', 'n2'], + ... 
'column_names': ['z1', 'z2']} + >>> pd.DataFrame.from_dict(data, orient='tight') z1 x y z2 1 2 n1 n2 a b 1 3 c 2 4 """ - index: list | Index | None = None + index = None orient = orient.lower() # type: ignore[assignment] if orient == "index": if len(data) > 0: @@ -1878,7 +1923,7 @@ def from_dict( else: realdata = data["data"] - def create_index(indexlist, namelist) -> Index: + def create_index(indexlist, namelist): index: Index if len(namelist) > 1: index = MultiIndex.from_tuples(indexlist, names=namelist) @@ -1921,7 +1966,6 @@ def to_numpy( Returns ------- numpy.ndarray - The NumPy array representing the values in the DataFrame. See Also -------- @@ -1944,7 +1988,7 @@ def to_numpy( For a mix of numeric and non-numeric types, the output array will have object dtype. - >>> df["C"] = pd.date_range("2000", periods=2) + >>> df['C'] = pd.date_range('2000', periods=2) >>> df.to_numpy() array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) @@ -1957,6 +2001,28 @@ def to_numpy( return result + def _create_data_for_split_and_tight_to_dict( + self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int] + ) -> list: + """ + Simple helper method to create data for to ``to_dict(orient="split")`` and + ``to_dict(orient="tight")`` to create the main output data + """ + if are_all_object_dtype_cols: + data = [ + list(map(maybe_box_native, t)) + for t in self.itertuples(index=False, name=None) + ] + else: + data = [list(t) for t in self.itertuples(index=False, name=None)] + if object_dtype_indices: + # If we have object_dtype_cols, apply maybe_box_naive after list + # comprehension for perf + for row in data: + for i in object_dtype_indices: + row[i] = maybe_box_native(row[i]) + return data + @overload def to_dict( self, @@ -1964,7 +2030,8 @@ def to_dict( *, into: type[MutableMappingT] | MutableMappingT, index: bool = ..., - ) -> MutableMappingT: ... + ) -> MutableMappingT: + ... @overload def to_dict( @@ -1973,7 +2040,8 @@ def to_dict( *, into: type[MutableMappingT] | MutableMappingT, index: bool = ..., - ) -> list[MutableMappingT]: ... + ) -> list[MutableMappingT]: + ... @overload def to_dict( @@ -1982,7 +2050,8 @@ def to_dict( *, into: type[dict] = ..., index: bool = ..., - ) -> dict: ... + ) -> dict: + ... @overload def to_dict( @@ -1991,17 +2060,21 @@ def to_dict( *, into: type[dict] = ..., index: bool = ..., - ) -> list[dict]: ... + ) -> list[dict]: + ... # error: Incompatible default for argument "into" (default has type "type # [dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "orient"], name="to_dict" + ) def to_dict( self, orient: Literal[ "dict", "list", "series", "split", "tight", "records", "index" ] = "dict", - *, - into: type[MutableMappingT] | MutableMappingT = dict, # type: ignore[assignment] + into: type[MutableMappingT] + | MutableMappingT = dict, # type: ignore[assignment] index: bool = True, ) -> MutableMappingT | list[MutableMappingT]: """ @@ -2039,9 +2112,7 @@ def to_dict( index : bool, default True Whether to include the index item (and index_names item if `orient` is 'tight') in the returned dictionary. Can only be ``False`` - when `orient` is 'split' or 'tight'. Note that when `orient` is - 'records', this parameter does not take effect (index item always - not included). + when `orient` is 'split' or 'tight'. .. 
versionadded:: 2.0.0 @@ -2059,9 +2130,9 @@ def to_dict( Examples -------- - >>> df = pd.DataFrame( - ... {"col1": [1, 2], "col2": [0.5, 0.75]}, index=["row1", "row2"] - ... ) + >>> df = pd.DataFrame({'col1': [1, 2], + ... 'col2': [0.5, 0.75]}, + ... index=['row1', 'row2']) >>> df col1 col2 row1 1 0.50 @@ -2071,7 +2142,7 @@ def to_dict( You can specify the return orientation. - >>> df.to_dict("series") + >>> df.to_dict('series') {'col1': row1 1 row2 2 Name: col1, dtype: int64, @@ -2079,17 +2150,17 @@ def to_dict( row2 0.75 Name: col2, dtype: float64} - >>> df.to_dict("split") + >>> df.to_dict('split') {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]]} - >>> df.to_dict("records") + >>> df.to_dict('records') [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] - >>> df.to_dict("index") + >>> df.to_dict('index') {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} - >>> df.to_dict("tight") + >>> df.to_dict('tight') {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]} @@ -2103,7 +2174,7 @@ def to_dict( If you want a `defaultdict`, you need to initialize it: >>> dd = defaultdict(list) - >>> df.to_dict("records", into=dd) + >>> df.to_dict('records', into=dd) [defaultdict(, {'col1': 1, 'col2': 0.5}), defaultdict(, {'col1': 2, 'col2': 0.75})] """ @@ -2111,6 +2182,144 @@ def to_dict( return to_dict(self, orient, into=into, index=index) + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "destination_table"], name="to_gbq" + ) + def to_gbq( + self, + destination_table: str, + project_id: str | None = None, + chunksize: int | None = None, + reauth: bool = False, + if_exists: ToGbqIfexist = "fail", + auth_local_webserver: bool = True, + table_schema: list[dict[str, str]] | None = None, + location: str | None = None, + progress_bar: bool = True, + credentials=None, + ) -> None: + """ + Write a DataFrame to a Google BigQuery table. + + .. deprecated:: 2.2.0 + + Please use ``pandas_gbq.to_gbq`` instead. + + This function requires the `pandas-gbq package + `__. + + See the `How to authenticate with Google BigQuery + `__ + guide for authentication instructions. + + Parameters + ---------- + destination_table : str + Name of table to be written, in the form ``dataset.tablename``. + project_id : str, optional + Google BigQuery Account project ID. Optional when available from + the environment. + chunksize : int, optional + Number of rows to be inserted in each chunk from the dataframe. + Set to ``None`` to load the whole dataframe at once. + reauth : bool, default False + Force Google BigQuery to re-authenticate the user. This is useful + if multiple accounts are used. + if_exists : str, default 'fail' + Behavior when the destination table exists. Value can be one of: + + ``'fail'`` + If table exists raise pandas_gbq.gbq.TableCreationError. + ``'replace'`` + If table exists, drop it, recreate it, and insert data. + ``'append'`` + If table exists, insert data. Create if does not exist. + auth_local_webserver : bool, default True + Use the `local webserver flow`_ instead of the `console flow`_ + when getting user credentials. + + .. _local webserver flow: + https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server + .. 
_console flow: + https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console + + *New in version 0.2.0 of pandas-gbq*. + + .. versionchanged:: 1.5.0 + Default value is changed to ``True``. Google has deprecated the + ``auth_local_webserver = False`` `"out of band" (copy-paste) + flow + `_. + table_schema : list of dicts, optional + List of BigQuery table fields to which according DataFrame + columns conform to, e.g. ``[{'name': 'col1', 'type': + 'STRING'},...]``. If schema is not provided, it will be + generated according to dtypes of DataFrame columns. See + BigQuery API documentation on available names of a field. + + *New in version 0.3.1 of pandas-gbq*. + location : str, optional + Location where the load job should run. See the `BigQuery locations + documentation + `__ for a + list of available locations. The location must match that of the + target dataset. + + *New in version 0.5.0 of pandas-gbq*. + progress_bar : bool, default True + Use the library `tqdm` to show the progress bar for the upload, + chunk by chunk. + + *New in version 0.5.0 of pandas-gbq*. + credentials : google.auth.credentials.Credentials, optional + Credentials for accessing Google APIs. Use this parameter to + override default credentials, such as to use Compute Engine + :class:`google.auth.compute_engine.Credentials` or Service + Account :class:`google.oauth2.service_account.Credentials` + directly. + + *New in version 0.8.0 of pandas-gbq*. + + See Also + -------- + pandas_gbq.to_gbq : This function in the pandas-gbq library. + read_gbq : Read a DataFrame from Google BigQuery. + + Examples + -------- + Example taken from `Google BigQuery documentation + `_ + + >>> project_id = "my-project" + >>> table_id = 'my_dataset.my_table' + >>> df = pd.DataFrame({ + ... "my_string": ["a", "b", "c"], + ... "my_int64": [1, 2, 3], + ... "my_float64": [4.0, 5.0, 6.0], + ... "my_bool1": [True, False, True], + ... "my_bool2": [False, True, False], + ... "my_dates": pd.date_range("now", periods=3), + ... } + ... ) + + >>> df.to_gbq(table_id, project_id=project_id) # doctest: +SKIP + """ + from pandas.io import gbq + + gbq.to_gbq( + self, + destination_table, + project_id=project_id, + chunksize=chunksize, + reauth=reauth, + if_exists=if_exists, + auth_local_webserver=auth_local_webserver, + table_schema=table_schema, + location=location, + progress_bar=progress_bar, + credentials=credentials, + ) + @classmethod def from_records( cls, @@ -2124,13 +2333,16 @@ def from_records( """ Convert structured or record ndarray to DataFrame. - Creates a DataFrame object from a structured ndarray, or sequence of - tuples or dicts. + Creates a DataFrame object from a structured ndarray, sequence of + tuples or dicts, or DataFrame. Parameters ---------- - data : structured ndarray, sequence of tuples or dicts + data : structured ndarray, sequence of tuples or dicts, or DataFrame Structured input data. + + .. deprecated:: 2.1.0 + Passing a DataFrame is deprecated. index : str, list of fields, array-like Field of array to use as the index, alternately a specific set of input labels to use. @@ -2139,10 +2351,9 @@ def from_records( columns : sequence, default None Column names to use. If the passed data do not have names associated with them, this argument provides names for the - columns. Otherwise, this argument indicates the order of the columns + columns. 
Otherwise this argument indicates the order of the columns in the result (any names not found in the data will become all-NA - columns) and limits the data to these columns if not all column names - are provided. + columns). coerce_float : bool, default False Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets. @@ -2162,10 +2373,8 @@ def from_records( -------- Data can be provided as a structured ndarray: - >>> data = np.array( - ... [(3, "a"), (2, "b"), (1, "c"), (0, "d")], - ... dtype=[("col_1", "i4"), ("col_2", "U1")], - ... ) + >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')], + ... dtype=[('col_1', 'i4'), ('col_2', 'U1')]) >>> pd.DataFrame.from_records(data) col_1 col_2 0 3 a @@ -2175,12 +2384,10 @@ def from_records( Data can be provided as a list of dicts: - >>> data = [ - ... {"col_1": 3, "col_2": "a"}, - ... {"col_1": 2, "col_2": "b"}, - ... {"col_1": 1, "col_2": "c"}, - ... {"col_1": 0, "col_2": "d"}, - ... ] + >>> data = [{'col_1': 3, 'col_2': 'a'}, + ... {'col_1': 2, 'col_2': 'b'}, + ... {'col_1': 1, 'col_2': 'c'}, + ... {'col_1': 0, 'col_2': 'd'}] >>> pd.DataFrame.from_records(data) col_1 col_2 0 3 a @@ -2190,8 +2397,8 @@ def from_records( Data can be provided as a list of tuples with corresponding columns: - >>> data = [(3, "a"), (2, "b"), (1, "c"), (0, "d")] - >>> pd.DataFrame.from_records(data, columns=["col_1", "col_2"]) + >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')] + >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2']) col_1 col_2 0 3 a 1 2 b @@ -2199,10 +2406,21 @@ def from_records( 3 0 d """ if isinstance(data, DataFrame): - raise TypeError( - "Passing a DataFrame to DataFrame.from_records is not supported. Use " + warnings.warn( + "Passing a DataFrame to DataFrame.from_records is deprecated. Use " "set_index and/or drop to modify the DataFrame instead.", + FutureWarning, + stacklevel=find_stack_level(), ) + if columns is not None: + if is_scalar(columns): + columns = [columns] + data = data[columns] + if index is not None: + data = data.set_index(index) + if exclude is not None: + data = data.drop(columns=exclude) + return data.copy(deep=False) result_index = None @@ -2215,7 +2433,7 @@ def maybe_reorder( ) -> tuple[list[ArrayLike], Index, Index | None]: """ If our desired 'columns' do not match the data's pre-existing 'arr_columns', - we re-order our arrays. This is like a preemptive (cheap) reindex. + we re-order our arrays. This is like a pre-emptive (cheap) reindex. 
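As a hedged aside on the ``from_records`` deprecation handled above: the suggested replacement is to call ``set_index`` / ``drop`` on the frame directly rather than round-tripping it through ``from_records`` (column names below are illustrative):

>>> df = pd.DataFrame({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]})
>>> # previously: pd.DataFrame.from_records(df, index="col_1")
>>> out = df.set_index("col_1")
>>> out.index.name, list(out.columns)
('col_1', ['col_2'])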
""" if len(arrays): length = len(arrays[0]) @@ -2319,17 +2537,16 @@ def maybe_reorder( exclude.update(index) if any(exclude): - arr_exclude = (x for x in exclude if x in arr_columns) - to_remove = {arr_columns.get_loc(col) for col in arr_exclude} # pyright: ignore[reportUnhashable] + arr_exclude = [x for x in exclude if x in arr_columns] + to_remove = [arr_columns.get_loc(col) for col in arr_exclude] arrays = [v for i, v in enumerate(arrays) if i not in to_remove] columns = columns.drop(exclude) - mgr = arrays_to_mgr(arrays, columns, result_index) - df = DataFrame._from_mgr(mgr, axes=mgr.axes) - if cls is not DataFrame: - return cls(df, copy=False) - return df + manager = _get_option("mode.data_manager", silent=True) + mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager) + + return cls._from_mgr(mgr, axes=mgr.axes) def to_records( self, index: bool = True, column_dtypes=None, index_dtypes=None @@ -2372,7 +2589,8 @@ def to_records( Examples -------- - >>> df = pd.DataFrame({"A": [1, 2], "B": [0.5, 0.75]}, index=["a", "b"]) + >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]}, + ... index=['a', 'b']) >>> df A B a 1 0.50 @@ -2526,6 +2744,7 @@ def _from_arrays( if dtype is not None: dtype = pandas_dtype(dtype) + manager = _get_option("mode.data_manager", silent=True) columns = ensure_index(columns) if len(columns) != len(arrays): raise ValueError("len(columns) must match len(arrays)") @@ -2535,6 +2754,7 @@ def _from_arrays( index, dtype=dtype, verify_integrity=verify_integrity, + typ=manager, ) return cls._from_mgr(mgr, axes=mgr.axes) @@ -2643,10 +2863,10 @@ def to_stata( Examples -------- - >>> df = pd.DataFrame( - ... [["falcon", 350], ["parrot", 18]], columns=["animal", "parrot"] - ... ) - >>> df.to_stata("animals.dta") # doctest: +SKIP + >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', + ... 'parrot'], + ... 'speed': [350, 18, 361, 15]}}) + >>> df.to_stata('animals.dta') # doctest: +SKIP """ if version not in (114, 117, 118, 119, None): raise ValueError("Only formats 114, 117, 118 and 119 are supported.") @@ -2706,16 +2926,6 @@ def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None: This includes the `compression`, `compression_level`, `chunksize` and `version` keywords. - See Also - -------- - DataFrame.to_parquet : Write a DataFrame to the binary parquet format. - DataFrame.to_excel : Write object to an Excel sheet. - DataFrame.to_sql : Write to a sql table. - DataFrame.to_csv : Write a csv file. - DataFrame.to_json : Convert the object to a JSON string. - DataFrame.to_html : Render a DataFrame as an HTML table. - DataFrame.to_string : Convert DataFrame to a string. - Notes ----- This function writes the dataframe as a `feather file @@ -2732,88 +2942,14 @@ def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None: to_feather(self, path, **kwargs) - @overload - def to_markdown( - self, - buf: None = ..., - *, - mode: str = ..., - index: bool = ..., - storage_options: StorageOptions | None = ..., - **kwargs, - ) -> str: ... - - @overload - def to_markdown( - self, - buf: FilePath | WriteBuffer[str], - *, - mode: str = ..., - index: bool = ..., - storage_options: StorageOptions | None = ..., - **kwargs, - ) -> None: ... - - @overload - def to_markdown( - self, - buf: FilePath | WriteBuffer[str] | None, - *, - mode: str = ..., - index: bool = ..., - storage_options: StorageOptions | None = ..., - **kwargs, - ) -> str | None: ... 
- - def to_markdown( - self, - buf: FilePath | WriteBuffer[str] | None = None, - *, - mode: str = "wt", - index: bool = True, - storage_options: StorageOptions | None = None, - **kwargs, - ) -> str | None: - """ - Print DataFrame in Markdown-friendly format. - - Parameters - ---------- - buf : str, Path or StringIO-like, optional, default None - Buffer to write to. If None, the output is returned as a string. - mode : str, optional - Mode in which file is opened, "wt" by default. - index : bool, optional, default True - Add index (row) labels. - - storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc. For HTTP(S) URLs the key-value pairs - are forwarded to ``urllib.request.Request`` as header options. For other - URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are - forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more - details, and for more examples on storage options refer `here - `_. - - **kwargs - These parameters will be passed to `tabulate `_. - - Returns - ------- - str - DataFrame in Markdown-friendly format. - - See Also - -------- - DataFrame.to_html : Render DataFrame to HTML-formatted table. - DataFrame.to_latex : Render DataFrame to LaTeX-formatted table. - - Notes - ----- - Requires the `tabulate `_ package. - - Examples + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "buf"], name="to_markdown" + ) + @doc( + Series.to_markdown, + klass=_shared_doc_kwargs["klass"], + storage_options=_shared_docs["storage_options"], + examples="""Examples -------- >>> df = pd.DataFrame( ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]} @@ -2833,8 +2969,16 @@ def to_markdown( | 0 | elk | dog | +----+------------+------------+ | 1 | pig | quetzal | - +----+------------+------------+ - """ + +----+------------+------------+""", + ) + def to_markdown( + self, + buf: FilePath | WriteBuffer[str] | None = None, + mode: str = "wt", + index: bool = True, + storage_options: StorageOptions | None = None, + **kwargs, + ) -> str | None: if "showindex" in kwargs: raise ValueError("Pass 'index' instead of 'showindex") @@ -2854,33 +2998,35 @@ def to_markdown( def to_parquet( self, path: None = ..., - *, engine: Literal["auto", "pyarrow", "fastparquet"] = ..., compression: str | None = ..., index: bool | None = ..., partition_cols: list[str] | None = ..., storage_options: StorageOptions = ..., **kwargs, - ) -> bytes: ... + ) -> bytes: + ... @overload def to_parquet( self, path: FilePath | WriteBuffer[bytes], - *, engine: Literal["auto", "pyarrow", "fastparquet"] = ..., compression: str | None = ..., index: bool | None = ..., partition_cols: list[str] | None = ..., storage_options: StorageOptions = ..., **kwargs, - ) -> None: ... + ) -> None: + ... + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "path"], name="to_parquet" + ) @doc(storage_options=_shared_docs["storage_options"]) def to_parquet( self, path: FilePath | WriteBuffer[bytes] | None = None, - *, engine: Literal["auto", "pyarrow", "fastparquet"] = "auto", compression: str | None = "snappy", index: bool | None = None, @@ -2932,9 +3078,6 @@ def to_parquet( Returns ------- bytes if no path argument is provided else None - Returns the DataFrame converted to the binary parquet format as bytes if no - path argument. Returns None and writes the DataFrame to the specified - location in the Parquet format if the path argument is provided. 
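To make the Returns note above concrete: with no ``path`` the Parquet payload comes back as ``bytes``, which can be re-read through a buffer (a sketch; requires a Parquet engine such as pyarrow):

>>> import io
>>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
>>> payload = df.to_parquet()  # doctest: +SKIP
>>> isinstance(payload, bytes)  # doctest: +SKIP
True
>>> pd.read_parquet(io.BytesIO(payload))  # doctest: +SKIP
   col1  col2
0     1     3
1     2     4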
See Also -------- @@ -2946,22 +3089,16 @@ def to_parquet( Notes ----- - * This function requires either the `fastparquet - `_ or `pyarrow - `_ library. - * When saving a DataFrame with categorical columns to parquet, - the file size may increase due to the inclusion of all possible - categories, not just those present in the data. This behavior - is expected and consistent with pandas' handling of categorical data. - To manage file size and ensure a more predictable roundtrip process, - consider using :meth:`Categorical.remove_unused_categories` on the - DataFrame before saving. + This function requires either the `fastparquet + `_ or `pyarrow + `_ library. Examples -------- - >>> df = pd.DataFrame(data={{"col1": [1, 2], "col2": [3, 4]}}) - >>> df.to_parquet("df.parquet.gzip", compression="gzip") # doctest: +SKIP - >>> pd.read_parquet("df.parquet.gzip") # doctest: +SKIP + >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}}) + >>> df.to_parquet('df.parquet.gzip', + ... compression='gzip') # doctest: +SKIP + >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP col1 col2 0 1 3 1 2 4 @@ -2989,36 +3126,6 @@ def to_parquet( **kwargs, ) - @overload - def to_orc( - self, - path: None = ..., - *, - engine: Literal["pyarrow"] = ..., - index: bool | None = ..., - engine_kwargs: dict[str, Any] | None = ..., - ) -> bytes: ... - - @overload - def to_orc( - self, - path: FilePath | WriteBuffer[bytes], - *, - engine: Literal["pyarrow"] = ..., - index: bool | None = ..., - engine_kwargs: dict[str, Any] | None = ..., - ) -> None: ... - - @overload - def to_orc( - self, - path: FilePath | WriteBuffer[bytes] | None, - *, - engine: Literal["pyarrow"] = ..., - index: bool | None = ..., - engine_kwargs: dict[str, Any] | None = ..., - ) -> bytes | None: ... - def to_orc( self, path: FilePath | WriteBuffer[bytes] | None = None, @@ -3028,7 +3135,7 @@ def to_orc( engine_kwargs: dict[str, Any] | None = None, ) -> bytes | None: """ - Write a DataFrame to the Optimized Row Columnar (ORC) format. + Write a DataFrame to the ORC format. .. versionadded:: 1.5.0 @@ -3055,8 +3162,7 @@ def to_orc( Returns ------- - bytes if no ``path`` argument is provided else None - Bytes object with DataFrame data if ``path`` is not specified else None. + bytes if no path argument is provided else None Raises ------ @@ -3076,8 +3182,6 @@ def to_orc( Notes ----- - * Find more information on ORC - `here `__. * Before using this function you should read the :ref:`user guide about ORC ` and :ref:`install optional dependencies `. * This function requires `pyarrow `_ @@ -3089,9 +3193,9 @@ def to_orc( Examples -------- - >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]}) - >>> df.to_orc("df.orc") # doctest: +SKIP - >>> pd.read_orc("df.orc") # doctest: +SKIP + >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]}) + >>> df.to_orc('df.orc') # doctest: +SKIP + >>> pd.read_orc('df.orc') # doctest: +SKIP col1 col2 0 1 4 1 2 3 @@ -3114,7 +3218,6 @@ def to_orc( def to_html( self, buf: FilePath | WriteBuffer[str], - *, columns: Axes | None = ..., col_space: ColspaceArgType | None = ..., header: bool = ..., @@ -3137,13 +3240,13 @@ def to_html( table_id: str | None = ..., render_links: bool = ..., encoding: str | None = ..., - ) -> None: ... + ) -> None: + ... 
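A small illustration of how ``to_html`` dispatches on ``buf``: with ``buf=None`` the markup is returned as a string, otherwise it is written to the buffer and ``None`` is returned (a sketch):

>>> import io
>>> df = pd.DataFrame({"col1": [1, 2]})
>>> df.to_html().startswith("<table")
True
>>> buf = io.StringIO()
>>> df.to_html(buf) is None
True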
@overload def to_html( self, buf: None = ..., - *, columns: Axes | None = ..., col_space: ColspaceArgType | None = ..., header: bool = ..., @@ -3166,8 +3269,12 @@ def to_html( table_id: str | None = ..., render_links: bool = ..., encoding: str | None = ..., - ) -> str: ... + ) -> str: + ... + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "buf"], name="to_html" + ) @Substitution( header_type="bool", header="Whether to print column labels, default True", @@ -3179,7 +3286,6 @@ def to_html( def to_html( self, buf: FilePath | WriteBuffer[str] | None = None, - *, columns: Axes | None = None, col_space: ColspaceArgType | None = None, header: bool = True, @@ -3214,13 +3320,9 @@ def to_html( Convert the characters <, >, and & to HTML-safe sequences. notebook : {True, False}, default False Whether the generated HTML is for IPython Notebook. - border : int or bool - When an integer value is provided, it sets the border attribute in - the opening tag, specifying the thickness of the border. - If ``False`` or ``0`` is passed, the border attribute will not - be present in the ```` tag. - The default value for this parameter is governed by - ``pd.options.display.html.border``. + border : int + A ``border=border`` attribute is included in the opening + `
` tag. Default ``pd.options.display.html.border``. table_id : str, optional A css id is included in the opening `
` tag if specified. render_links : bool, default False @@ -3234,7 +3336,7 @@ def to_html( Examples -------- - >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]}) + >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]}) >>> html_string = '''
... ... @@ -3311,7 +3413,8 @@ def to_xml( stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ..., compression: CompressionOptions = ..., storage_options: StorageOptions | None = ..., - ) -> str: ... + ) -> str: + ... @overload def to_xml( @@ -3333,8 +3436,12 @@ def to_xml( stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ..., compression: CompressionOptions = ..., storage_options: StorageOptions | None = ..., - ) -> None: ... + ) -> None: + ... + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "path_or_buffer"], name="to_xml" + ) @doc( storage_options=_shared_docs["storage_options"], compression_options=_shared_docs["compression_options"] % "path_or_buffer", @@ -3342,7 +3449,6 @@ def to_xml( def to_xml( self, path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, - *, index: bool = True, root_name: str | None = "data", row_name: str | None = "row", @@ -3435,10 +3541,9 @@ def to_xml( Examples -------- - >>> df = pd.DataFrame( - ... [["square", 360, 4], ["circle", 360, np.nan], ["triangle", 180, 3]], - ... columns=["shape", "degrees", "sides"], - ... ) + >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'], + ... 'degrees': [360, 360, 180], + ... 'sides': [4, np.nan, 3]}}) >>> df.to_xml() # doctest: +SKIP @@ -3463,9 +3568,9 @@ def to_xml( - >>> df.to_xml( - ... attr_cols=["index", "shape", "degrees", "sides"] - ... ) # doctest: +SKIP + >>> df.to_xml(attr_cols=[ + ... 'index', 'shape', 'degrees', 'sides' + ... ]) # doctest: +SKIP @@ -3473,9 +3578,8 @@ def to_xml( - >>> df.to_xml( - ... namespaces={{"doc": "https://example.com"}}, prefix="doc" - ... ) # doctest: +SKIP + >>> df.to_xml(namespaces={{"doc": "https://example.com"}}, + ... prefix="doc") # doctest: +SKIP @@ -3607,8 +3711,9 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series: Examples -------- - >>> dtypes = ["int64", "float64", "complex128", "object", "bool"] - >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t)) for t in dtypes]) + >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] + >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t)) + ... for t in dtypes]) >>> df = pd.DataFrame(data) >>> df.head() int64 float64 complex128 object bool @@ -3649,8 +3754,8 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series: Use a Categorical for efficient storage of an object-dtype column with many repeated values. - >>> df["object"].astype("category").memory_usage(deep=True) - 5136 + >>> df['object'].astype('category').memory_usage(deep=True) + 5244 """ result = self._constructor_sliced( [c.memory_usage(index=False, deep=deep) for col, c in self.items()], @@ -3664,11 +3769,7 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series: result = index_memory_usage._append(result) return result - def transpose( - self, - *args, - copy: bool | lib.NoDefault = lib.no_default, - ) -> DataFrame: + def transpose(self, *args, copy: bool = False) -> DataFrame: """ Transpose index and columns. @@ -3699,8 +3800,6 @@ def transpose( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` - .. 
deprecated:: 3.0.0 - Returns ------- DataFrame @@ -3720,7 +3819,7 @@ def transpose( -------- **Square DataFrame with homogeneous dtype** - >>> d1 = {"col1": [1, 2], "col2": [3, 4]} + >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} >>> df1 = pd.DataFrame(data=d1) >>> df1 col1 col2 @@ -3747,12 +3846,10 @@ def transpose( **Non-square DataFrame with mixed dtypes** - >>> d2 = { - ... "name": ["Alice", "Bob"], - ... "score": [9.5, 8], - ... "employed": [False, True], - ... "kids": [0, 0], - ... } + >>> d2 = {'name': ['Alice', 'Bob'], + ... 'score': [9.5, 8], + ... 'employed': [False, True], + ... 'kids': [0, 0]} >>> df2 = pd.DataFrame(data=d2) >>> df2 name score employed kids @@ -3781,15 +3878,16 @@ def transpose( 1 object dtype: object """ - self._check_copy_deprecation(copy) nv.validate_transpose(args, {}) # construct the args - first_dtype = self.dtypes.iloc[0] if len(self.columns) else None + dtypes = list(self.dtypes) if self._can_fast_transpose: # Note: tests pass without this, but this improves perf quite a bit. new_vals = self._values.T + if copy and not using_copy_on_write(): + new_vals = new_vals.copy() result = self._constructor( new_vals, @@ -3798,16 +3896,16 @@ def transpose( copy=False, dtype=new_vals.dtype, ) - if len(self) > 0: - result._mgr.add_references(self._mgr) + if using_copy_on_write() and len(self) > 0: + result._mgr.add_references(self._mgr) # type: ignore[arg-type] elif ( self._is_homogeneous_type - and first_dtype is not None - and isinstance(first_dtype, ExtensionDtype) + and dtypes + and isinstance(dtypes[0], ExtensionDtype) ): new_values: list - if isinstance(first_dtype, BaseMaskedDtype): + if isinstance(dtypes[0], BaseMaskedDtype): # We have masked arrays with the same dtype. We can transpose faster. from pandas.core.arrays.masked import ( transpose_homogeneous_masked_arrays, @@ -3816,7 +3914,7 @@ def transpose( new_values = transpose_homogeneous_masked_arrays( cast(Sequence[BaseMaskedArray], self._iter_column_arrays()) ) - elif isinstance(first_dtype, ArrowDtype): + elif isinstance(dtypes[0], ArrowDtype): # We have arrow EAs with the same dtype. We can transpose faster. from pandas.core.arrays.arrow.array import ( ArrowExtensionArray, @@ -3828,11 +3926,10 @@ def transpose( ) else: # We have other EAs with the same dtype. We preserve dtype in transpose. 
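To illustrate the homogeneous extension-dtype branch referred to just above: when every column shares one masked dtype, the transpose keeps that dtype instead of falling back to object (a hedged sketch):

>>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, dtype="Int64")
>>> df.T.dtypes
0    Int64
1    Int64
dtype: object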
- arr_typ = first_dtype.construct_array_type() + dtyp = dtypes[0] + arr_typ = dtyp.construct_array_type() values = self.values - new_values = [ - arr_typ._from_sequence(row, dtype=first_dtype) for row in values - ] + new_values = [arr_typ._from_sequence(row, dtype=dtyp) for row in values] result = type(self)._from_arrays( new_values, @@ -3843,6 +3940,8 @@ def transpose( else: new_arr = self.values.T + if copy and not using_copy_on_write(): + new_arr = new_arr.copy() result = self._constructor( new_arr, index=self.columns, @@ -3870,7 +3969,7 @@ def T(self) -> DataFrame: Examples -------- - >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df col1 col2 0 1 3 @@ -3901,14 +4000,24 @@ def _ixs(self, i: int, axis: AxisInt = 0) -> Series: if axis == 0: new_mgr = self._mgr.fast_xs(i) + # if we are a copy, mark as such + copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes) result._name = self.index[i] - return result.__finalize__(self) + result = result.__finalize__(self) + result._set_is_copy(self, copy=copy) + return result # icol else: + label = self.columns[i] + col_mgr = self._mgr.iget(i) - return self._box_col_values(col_mgr, i) + result = self._box_col_values(col_mgr, i) + + # this is a cached value, mark it so + result._set_as_cached(label, self) + return result def _get_column_array(self, i: int) -> ArrayLike: """ @@ -3928,27 +4037,50 @@ def _iter_column_arrays(self) -> Iterator[ArrayLike]: Warning! The returned array is a view but doesn't handle Copy-on-Write, so this should be used with caution (for read-only purposes). """ - for i in range(len(self.columns)): - yield self._get_column_array(i) + if isinstance(self._mgr, ArrayManager): + yield from self._mgr.arrays + else: + for i in range(len(self.columns)): + yield self._get_column_array(i) + + def _getitem_nocopy(self, key: list): + """ + Behaves like __getitem__, but returns a view in cases where __getitem__ + would make a copy. + """ + # TODO(CoW): can be removed if/when we are always Copy-on-Write + indexer = self.columns._get_indexer_strict(key, "columns")[1] + new_axis = self.columns[indexer] + + new_mgr = self._mgr.reindex_indexer( + new_axis, + indexer, + axis=0, + allow_dups=True, + copy=False, + only_slice=True, + ) + result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + result = result.__finalize__(self) + return result def __getitem__(self, key): check_dict_or_set_indexers(key) key = lib.item_from_zerodim(key) key = com.apply_if_callable(key, self) - if is_hashable(key) and not is_iterator(key) and not isinstance(key, slice): + if is_hashable(key) and not is_iterator(key): # is_iterator to exclude generator e.g. 
test_getitem_listlike - # As of Python 3.12, slice is hashable which breaks MultiIndex (GH#57500) - # shortcut if the key is in columns is_mi = isinstance(self.columns, MultiIndex) # GH#45316 Return view if key is not duplicated # Only use drop_duplicates with duplicates for performance if not is_mi and ( - (self.columns.is_unique and key in self.columns) + self.columns.is_unique + and key in self.columns or key in self.columns.drop_duplicates(keep=False) ): - return self._get_item(key) + return self._get_item_cache(key) elif is_mi and self.columns.is_unique and key in self.columns: return self._getitem_multilevel(key) @@ -3987,7 +4119,7 @@ def __getitem__(self, key): if isinstance(indexer, slice): return self._slice(indexer, axis=1) - data = self.take(indexer, axis=1) + data = self._take_with_is_copy(indexer, axis=1) if is_single_key: # What does looking for a single key in a non-unique index return? @@ -3996,7 +4128,7 @@ def __getitem__(self, key): # - we have a MultiIndex on columns (test on self.columns, #21309) if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): # GH#26490 using data[key] can cause RecursionError - return data._get_item(key) + return data._get_item_cache(key) return data @@ -4022,10 +4154,10 @@ def _getitem_bool_array(self, key): key = check_bool_indexer(self.index, key) if key.all(): - return self.copy(deep=False) + return self.copy(deep=None) indexer = key.nonzero()[0] - return self.take(indexer, axis=0) + return self._take_with_is_copy(indexer, axis=0) def _getitem_multilevel(self, key): # self.columns is a MultiIndex @@ -4055,6 +4187,7 @@ def _getitem_multilevel(self, key): result, index=self.index, name=key ) + result._set_is_copy(self) return result else: # loc is neither a slice nor ndarray, so must be an int @@ -4083,7 +4216,8 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: series = self._ixs(col, axis=1) return series._values[index] - series = self._get_item(col) + series = self._get_item_cache(col) + engine = self.index._engine if not isinstance(self.index, MultiIndex): # CategoricalIndex: Trying to use the engine fastpath may give incorrect @@ -4094,7 +4228,7 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: # For MultiIndex going through engine effectively restricts us to # same-length tuples; see test_get_set_value_no_partial_indexing - loc = self.index._engine.get_loc(index) + loc = engine.get_loc(index) return series._values[loc] def isetitem(self, loc, value) -> None: @@ -4110,11 +4244,6 @@ def isetitem(self, loc, value) -> None: value : scalar or arraylike Value(s) for the column. - See Also - -------- - DataFrame.iloc : Purely integer-location based indexing for selection by - position. - Notes ----- ``frame.isetitem(loc, value)`` is an in-place method as it will @@ -4125,15 +4254,6 @@ def isetitem(self, loc, value) -> None: In cases where ``frame.columns`` is unique, this is equivalent to ``frame[frame.columns[i]] = value``. 
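A short usage sketch for ``isetitem``, setting the second column by position (equivalent to ``df[df.columns[1]] = ...`` when the labels are unique):

>>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
>>> df.isetitem(1, [5, 6])
>>> df
   A  B
0  1  5
1  2  6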
- - Examples - -------- - >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) - >>> df.isetitem(1, [5, 6]) - >>> df - A B - 0 1 5 - 1 2 6 """ if isinstance(value, DataFrame): if is_integer(loc): @@ -4154,11 +4274,22 @@ def isetitem(self, loc, value) -> None: self._iset_item_mgr(loc, arraylike, inplace=False, refs=refs) def __setitem__(self, key, value) -> None: - if not PYPY: + if not PYPY and using_copy_on_write(): if sys.getrefcount(self) <= 3: warnings.warn( _chained_assignment_msg, ChainedAssignmentError, stacklevel=2 ) + elif not PYPY and not using_copy_on_write(): + if sys.getrefcount(self) <= 3 and ( + warn_copy_on_write() + or ( + not warn_copy_on_write() + and any(b.refs.has_reference() for b in self._mgr.blocks) # type: ignore[union-attr] + ) + ): + warnings.warn( + _chained_assignment_warning_msg, FutureWarning, stacklevel=2 + ) key = com.apply_if_callable(key, self) @@ -4188,9 +4319,10 @@ def _setitem_slice(self, key: slice, value) -> None: # NB: we can't just use self.loc[key] = value because that # operates on labels and we need to operate positional for # backwards-compat, xref GH#31469 + self._check_setitem_copy() self.iloc[key] = value - def _setitem_array(self, key, value) -> None: + def _setitem_array(self, key, value): # also raises Exception if object array with NA values if com.is_bool_indexer(key): # bool indexer is indexing along rows @@ -4200,6 +4332,7 @@ def _setitem_array(self, key, value) -> None: ) key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] + self._check_setitem_copy() if isinstance(value, DataFrame): # GH#39931 reindex since iloc does not align value = value.reindex(self.index.take(indexer)) @@ -4224,12 +4357,12 @@ def _setitem_array(self, key, value) -> None: elif np.ndim(value) > 1: # list of lists value = DataFrame(value).values - self._setitem_array(key, value) + return self._setitem_array(key, value) else: self._iset_not_inplace(key, value) - def _iset_not_inplace(self, key, value) -> None: + def _iset_not_inplace(self, key, value): # GH#39510 when setting with df[key] = obj with a list-like key and # list-like value, we iterate over those listlikes and set columns # one at a time. This is different from dispatching to @@ -4273,7 +4406,7 @@ def igetitem(obj, i: int): finally: self.columns = orig_columns - def _setitem_frame(self, key, value) -> None: + def _setitem_frame(self, key, value): # support boolean setting with DataFrame input, e.g. 
# df[df > df2] = 0 if isinstance(key, np.ndarray): @@ -4281,11 +4414,12 @@ def _setitem_frame(self, key, value) -> None: raise ValueError("Array conditional must be same shape as self") key = self._constructor(key, **self._construct_axes_dict(), copy=False) - if key.size and not all(is_bool_dtype(blk.dtype) for blk in key._mgr.blocks): + if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes): raise TypeError( "Must pass DataFrame or 2-d ndarray with boolean values only" ) + self._check_setitem_copy() self._where(-key, value, inplace=True) def _set_item_frame_value(self, key, value: DataFrame) -> None: @@ -4347,6 +4481,7 @@ def _iset_item_mgr( ) -> None: # when called from _set_item_mgr loc can be anything returned from get_loc self._mgr.iset(loc, value, inplace=inplace, refs=refs) + self._clear_item_cache() def _set_item_mgr( self, key, value: ArrayLike, refs: BlockValuesRefs | None = None @@ -4359,10 +4494,27 @@ def _set_item_mgr( else: self._iset_item_mgr(loc, value, refs=refs) + # check if we are modifying a copy + # try to set first as we want an invalid + # value exception to occur first + if len(self): + self._check_setitem_copy() + def _iset_item(self, loc: int, value: Series, inplace: bool = True) -> None: # We are only called from _replace_columnwise which guarantees that # no reindex is necessary - self._iset_item_mgr(loc, value._values, inplace=inplace, refs=value._references) + if using_copy_on_write(): + self._iset_item_mgr( + loc, value._values, inplace=inplace, refs=value._references + ) + else: + self._iset_item_mgr(loc, value._values.copy(), inplace=True) + + # check if we are modifying a copy + # try to set first as we want an invalid + # value exception to occur first + if len(self): + self._check_setitem_copy() def _set_item(self, key, value) -> None: """ @@ -4414,6 +4566,7 @@ def _set_value( icol = self.columns.get_loc(col) iindex = self.index.get_loc(index) self._mgr.column_setitem(icol, iindex, value, inplace_only=True) + self._clear_item_cache() except (KeyError, TypeError, ValueError, LossySetitemError): # get_loc might raise a KeyError for missing labels (falling back @@ -4425,6 +4578,7 @@ def _set_value( self.iloc[index, col] = value else: self.loc[index, col] = value + self._item_cache.pop(col, None) except InvalidIndexError as ii_err: # GH48729: Seems like you are trying to assign a value to a @@ -4456,57 +4610,101 @@ def _ensure_valid_index(self, value) -> None: self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan) - def _box_col_values(self, values: SingleBlockManager, loc: int) -> Series: + def _box_col_values(self, values: SingleDataManager, loc: int) -> Series: """ Provide boxed values for a column. """ # Lookup in columns so that if e.g. a str datetime was passed # we attach the Timestamp object as the name. 
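A tiny illustration of the boxing described above: selecting a single column hands back a Series whose ``name`` is the column label (a sketch):

>>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
>>> df["A"]
0    1
1    2
Name: A, dtype: int64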
name = self.columns[loc] - # We get index=self.index bc values is a SingleBlockManager + # We get index=self.index bc values is a SingleDataManager obj = self._constructor_sliced_from_mgr(values, axes=values.axes) obj._name = name return obj.__finalize__(self) - def _get_item(self, item: Hashable) -> Series: - loc = self.columns.get_loc(item) - return self._ixs(loc, axis=1) + # ---------------------------------------------------------------------- + # Lookup Caching + + def _clear_item_cache(self) -> None: + self._item_cache.clear() + + def _get_item_cache(self, item: Hashable) -> Series: + """Return the cached item, item represents a label indexer.""" + if using_copy_on_write() or warn_copy_on_write(): + loc = self.columns.get_loc(item) + return self._ixs(loc, axis=1) + + cache = self._item_cache + res = cache.get(item) + if res is None: + # All places that call _get_item_cache have unique columns, + # pending resolution of GH#33047 + + loc = self.columns.get_loc(item) + res = self._ixs(loc, axis=1) + + cache[item] = res + + # for a chain + res._is_copy = self._is_copy + return res + + def _reset_cacher(self) -> None: + # no-op for DataFrame + pass + + def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None: + """ + The object has called back to us saying maybe it has changed. + """ + loc = self._info_axis.get_loc(item) + arraylike = value._values + + old = self._ixs(loc, axis=1) + if old._values is value._values and inplace: + # GH#46149 avoid making unnecessary copies/block-splitting + return + + self._mgr.iset(loc, arraylike, inplace=inplace) # ---------------------------------------------------------------------- # Unsorted @overload - def query( - self, expr: str, *, inplace: Literal[False] = ..., **kwargs - ) -> DataFrame: ... + def query(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> DataFrame: + ... @overload - def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: ... + def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: + ... @overload - def query( - self, expr: str, *, inplace: bool = ..., **kwargs - ) -> DataFrame | None: ... + def query(self, expr: str, *, inplace: bool = ..., **kwargs) -> DataFrame | None: + ... def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | None: """ Query the columns of a DataFrame with a boolean expression. - .. warning:: - - This method can run arbitrary code which can make you vulnerable to code - injection if you pass user input to this function. - Parameters ---------- expr : str The query string to evaluate. - See the documentation for :func:`eval` for details of - supported operations and functions in the query string. + You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. + + You can refer to column names that are not valid Python variable names + by surrounding them in backticks. Thus, column names containing spaces + or punctuations (besides underscores) or starting with digits must be + surrounded by backticks. (For example, a column named "Area (cm^2)" would + be referenced as ```Area (cm^2)```). Column names which are Python keywords + (like "list", "for", "import", etc) cannot be used. + + For example, if one of your columns is called ``a a`` and you want + to sum it with ``b``, your query should be ```a a` + b``. - See the documentation for :meth:`DataFrame.eval` for details on - referring to column names and variables in the query string. 
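A compact sketch of the ``@`` environment-variable form described in the parameter text above (the variable name is arbitrary):

>>> df = pd.DataFrame({"A": range(1, 6), "B": range(10, 0, -2)})
>>> threshold = 4
>>> df.query("A > @threshold")
   A  B
4  5  2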
inplace : bool Whether to modify the DataFrame rather than creating a new one. **kwargs @@ -4571,54 +4769,55 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No quoted string are replaced by strings that are allowed as a Python identifier. These characters include all operators in Python, the space character, the question mark, the exclamation mark, the dollar sign, and the euro sign. - - A backtick can be escaped by double backticks. - - See also the `Python documentation about lexical analysis - `__ + For other characters that fall outside the ASCII range (U+0001..U+007F) + and those that are not further specified in PEP 3131, + the query parser will raise an error. + This excludes whitespace different than the space character, + but also the hashtag (as it is used for comments) and the backtick + itself (backtick can also not be escaped). + + In a special case, quotes that make a pair around a backtick can + confuse the parser. + For example, ```it's` > `that's``` will raise an error, + as it forms a quoted string (``'s > `that'``) with a backtick inside. + + See also the Python documentation about lexical analysis + (https://docs.python.org/3/reference/lexical_analysis.html) in combination with the source code in :mod:`pandas.core.computation.parsing`. Examples -------- - >>> df = pd.DataFrame( - ... {"A": range(1, 6), "B": range(10, 0, -2), "C&C": range(10, 5, -1)} - ... ) + >>> df = pd.DataFrame({'A': range(1, 6), + ... 'B': range(10, 0, -2), + ... 'C C': range(10, 5, -1)}) >>> df - A B C&C + A B C C 0 1 10 10 1 2 8 9 2 3 6 8 3 4 4 7 4 5 2 6 - >>> df.query("A > B") - A B C&C + >>> df.query('A > B') + A B C C 4 5 2 6 The previous expression is equivalent to >>> df[df.A > df.B] - A B C&C + A B C C 4 5 2 6 For columns with spaces in their name, you can use backtick quoting. - >>> df.query("B == `C&C`") - A B C&C + >>> df.query('B == `C C`') + A B C C 0 1 10 10 The previous expression is equivalent to - >>> df[df.B == df["C&C"]] - A B C&C + >>> df[df.B == df['C C']] + A B C C 0 1 10 10 - - Using local variable: - - >>> local_var = 2 - >>> df.query("A <= @local_var") - A B C&C - 0 1 10 10 - 1 2 8 9 """ inplace = validate_bool_kwarg(inplace, "inplace") if not isinstance(expr, str): @@ -4626,7 +4825,6 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No raise ValueError(msg) kwargs["level"] = kwargs.pop("level", 0) + 1 kwargs["target"] = None - res = self.eval(expr, **kwargs) try: @@ -4643,20 +4841,17 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No return result @overload - def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any: ... + def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any: + ... @overload - def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: ... + def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: + ... def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: """ Evaluate a string describing operations on DataFrame columns. - .. warning:: - - This method can run arbitrary code which can make you vulnerable to code - injection if you pass user input to this function. - Operates on columns only, not specific rows or elements. This allows `eval` to run arbitrary code, which can make you vulnerable to code injection if you pass user input to this function. 
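To make the ``inplace`` flag concrete before the examples that follow: with ``inplace=True`` an assignment expression mutates the frame and returns ``None`` (a sketch):

>>> df = pd.DataFrame({"A": [1, 2], "B": [10, 20]})
>>> df.eval("C = A + B", inplace=True)
>>> df
   A   B   C
0  1  10  11
1  2  20  22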
@@ -4665,23 +4860,6 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: ---------- expr : str The expression string to evaluate. - - You can refer to variables - in the environment by prefixing them with an '@' character like - ``@a + b``. - - You can refer to column names that are not valid Python variable names - by surrounding them in backticks. Thus, column names containing spaces - or punctuation (besides underscores) or starting with digits must be - surrounded by backticks. (For example, a column named "Area (cm^2)" would - be referenced as ```Area (cm^2)```). Column names which are Python keywords - (like "if", "for", "import", etc) cannot be used. - - For example, if one of your columns is called ``a a`` and you want - to sum it with ``b``, your query should be ```a a` + b``. - - See the documentation for :func:`eval` for full details of - supported operations and functions in the expression string. inplace : bool, default False If the expression contains an assignment, whether to perform the operation inplace and mutate the existing DataFrame. Otherwise, @@ -4689,7 +4867,7 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: **kwargs See the documentation for :func:`eval` for complete details on the keyword arguments accepted by - :meth:`~pandas.DataFrame.eval`. + :meth:`~pandas.DataFrame.query`. Returns ------- @@ -4713,17 +4891,15 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: Examples -------- - >>> df = pd.DataFrame( - ... {"A": range(1, 6), "B": range(10, 0, -2), "C&C": range(10, 5, -1)} - ... ) + >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) >>> df - A B C&C - 0 1 10 10 - 1 2 8 9 - 2 3 6 8 - 3 4 4 7 - 4 5 2 6 - >>> df.eval("A + B") + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + >>> df.eval('A + B') 0 11 1 10 2 9 @@ -4734,56 +4910,35 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: Assignment is allowed though by default the original DataFrame is not modified. - >>> df.eval("D = A + B") - A B C&C D - 0 1 10 10 11 - 1 2 8 9 10 - 2 3 6 8 9 - 3 4 4 7 8 - 4 5 2 6 7 + >>> df.eval('C = A + B') + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 >>> df - A B C&C - 0 1 10 10 - 1 2 8 9 - 2 3 6 8 - 3 4 4 7 - 4 5 2 6 + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 Multiple columns can be assigned to using multi-line expressions: >>> df.eval( ... ''' - ... D = A + B - ... E = A - B + ... C = A + B + ... D = A - B ... ''' ... ) - A B C&C D E - 0 1 10 10 11 -9 - 1 2 8 9 10 -6 - 2 3 6 8 9 -3 - 3 4 4 7 8 0 - 4 5 2 6 7 3 - - For columns with spaces or other disallowed characters in their name, you can - use backtick quoting. - - >>> df.eval("B * `C&C`") - 0 100 - 1 72 - 2 48 - 3 28 - 4 12 - - Local variables shall be explicitly referenced using ``@`` - character in front of the name: - - >>> local_var = 2 - >>> df.eval("@local_var * A") - 0 2 - 1 4 - 2 6 - 3 8 - 4 10 + A B C D + 0 1 10 11 -9 + 1 2 8 10 -6 + 2 3 6 9 -3 + 3 4 4 8 0 + 4 5 2 7 3 """ from pandas.core.computation.eval import eval as _eval @@ -4798,14 +4953,10 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: return _eval(expr, inplace=inplace, **kwargs) - def select_dtypes(self, include=None, exclude=None) -> DataFrame: + def select_dtypes(self, include=None, exclude=None) -> Self: """ Return a subset of the DataFrame's columns based on the column dtypes. - This method allows for filtering columns based on their data types. 
- It is useful when working with heterogeneous DataFrames where operations - need to be performed on a specific subset of data types. - Parameters ---------- include, exclude : scalar or list-like @@ -4823,7 +4974,6 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: ValueError * If both of ``include`` and ``exclude`` are empty * If ``include`` and ``exclude`` have overlapping elements - TypeError * If any kind of string dtype is passed in. See Also @@ -4849,9 +4999,9 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: Examples -------- - >>> df = pd.DataFrame( - ... {"a": [1, 2] * 3, "b": [True, False] * 3, "c": [1.0, 2.0] * 3} - ... ) + >>> df = pd.DataFrame({'a': [1, 2] * 3, + ... 'b': [True, False] * 3, + ... 'c': [1.0, 2.0] * 3}) >>> df a b c 0 1 True 1.0 @@ -4861,7 +5011,7 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: 4 1 True 1.0 5 2 False 2.0 - >>> df.select_dtypes(include="bool") + >>> df.select_dtypes(include='bool') b 0 True 1 False @@ -4870,7 +5020,7 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: 4 True 5 False - >>> df.select_dtypes(include=["float64"]) + >>> df.select_dtypes(include=['float64']) c 0 1.0 1 2.0 @@ -4879,7 +5029,7 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: 4 1.0 5 2.0 - >>> df.select_dtypes(exclude=["int64"]) + >>> df.select_dtypes(exclude=['int64']) b c 0 True 1.0 1 False 2.0 @@ -4945,14 +5095,15 @@ def predicate(arr: ArrayLike) -> bool: return True - mgr = self._mgr._get_data_subset(predicate).copy(deep=False) - return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) + mgr = self._mgr._get_data_subset(predicate).copy(deep=None) + # error: Incompatible return value type (got "DataFrame", expected "Self") + return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) # type: ignore[return-value] def insert( self, loc: int, column: Hashable, - value: object, + value: Scalar | AnyArrayLike, allow_duplicates: bool | lib.NoDefault = lib.no_default, ) -> None: """ @@ -4978,7 +5129,7 @@ def insert( Examples -------- - >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df col1 col2 0 1 3 @@ -5036,7 +5187,7 @@ def assign(self, **kwargs) -> DataFrame: Parameters ---------- - **kwargs : callable or Series + **kwargs : dict of {str: callable or Series} The column names are keywords. If the values are callable, they are computed on the DataFrame and assigned to the new columns. The callable must not @@ -5050,11 +5201,6 @@ def assign(self, **kwargs) -> DataFrame: A new DataFrame with the new columns in addition to all the existing columns. - See Also - -------- - DataFrame.loc : Select a subset of a DataFrame by labels. - DataFrame.iloc : Select a subset of a DataFrame by positions. - Notes ----- Assigning multiple columns within the same ``assign`` is possible. @@ -5063,7 +5209,8 @@ def assign(self, **kwargs) -> DataFrame: Examples -------- - >>> df = pd.DataFrame({"temp_c": [17.0, 25.0]}, index=["Portland", "Berkeley"]) + >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]}, + ... 
index=['Portland', 'Berkeley']) >>> df temp_c Portland 17.0 @@ -5079,7 +5226,7 @@ def assign(self, **kwargs) -> DataFrame: Alternatively, the same behavior can be achieved by directly referencing an existing Series or sequence: - >>> df.assign(temp_f=df["temp_c"] * 9 / 5 + 32) + >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32) temp_c temp_f Portland 17.0 62.6 Berkeley 25.0 77.0 @@ -5087,15 +5234,13 @@ def assign(self, **kwargs) -> DataFrame: You can create multiple columns within the same assign where one of the columns depends on another one defined within the same assign: - >>> df.assign( - ... temp_f=lambda x: x["temp_c"] * 9 / 5 + 32, - ... temp_k=lambda x: (x["temp_f"] + 459.67) * 5 / 9, - ... ) + >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, + ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9) temp_c temp_f temp_k Portland 17.0 62.6 290.15 Berkeley 25.0 77.0 298.15 """ - data = self.copy(deep=False) + data = self.copy(deep=None) for k, v in kwargs.items(): data[k] = com.apply_if_callable(v, data) @@ -5126,7 +5271,22 @@ def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: if is_list_like(value): com.require_length_match(value, self.index) - return sanitize_array(value, self.index, copy=True, allow_2d=True), None + arr = sanitize_array(value, self.index, copy=True, allow_2d=True) + if ( + isinstance(value, Index) + and value.dtype == "object" + and arr.dtype != value.dtype + ): # + # TODO: Remove kludge in sanitize_array for string mode when enforcing + # this deprecation + warnings.warn( + "Setting an Index with object dtype into a DataFrame will stop " + "inferring another dtype in a future version. Cast the Index " + "explicitly before setting it into the DataFrame.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return arr, None @property def _series(self): @@ -5135,7 +5295,9 @@ def _series(self): # ---------------------------------------------------------------------- # Reindexing and alignment - def _reindex_multi(self, axes: dict[str, Index], fill_value) -> DataFrame: + def _reindex_multi( + self, axes: dict[str, Index], copy: bool, fill_value + ) -> DataFrame: """ We are guaranteed non-Nones in the axes. """ @@ -5157,6 +5319,7 @@ def _reindex_multi(self, axes: dict[str, Index], fill_value) -> DataFrame: else: return self._reindex_with_indexers( {0: [new_index, row_indexer], 1: [new_columns, col_indexer]}, + copy=copy, fill_value=fill_value, ) @@ -5196,7 +5359,7 @@ def set_axis( labels, *, axis: Axis = 0, - copy: bool | lib.NoDefault = lib.no_default, + copy: bool | None = None, ) -> DataFrame: return super().set_axis(labels, axis=axis, copy=copy) @@ -5213,7 +5376,7 @@ def reindex( columns=None, axis: Axis | None = None, method: ReindexMethod | None = None, - copy: bool | lib.NoDefault = lib.no_default, + copy: bool | None = None, level: Level | None = None, fill_value: Scalar | None = np.nan, limit: int | None = None, @@ -5225,59 +5388,62 @@ def reindex( columns=columns, axis=axis, method=method, + copy=copy, level=level, fill_value=fill_value, limit=limit, tolerance=tolerance, - copy=copy, ) @overload def drop( self, - labels: IndexLabel | ListLike = ..., + labels: IndexLabel = ..., *, axis: Axis = ..., - index: IndexLabel | ListLike = ..., - columns: IndexLabel | ListLike = ..., + index: IndexLabel = ..., + columns: IndexLabel = ..., level: Level = ..., inplace: Literal[True], errors: IgnoreRaise = ..., - ) -> None: ... + ) -> None: + ... 
@overload def drop( self, - labels: IndexLabel | ListLike = ..., + labels: IndexLabel = ..., *, axis: Axis = ..., - index: IndexLabel | ListLike = ..., - columns: IndexLabel | ListLike = ..., + index: IndexLabel = ..., + columns: IndexLabel = ..., level: Level = ..., inplace: Literal[False] = ..., errors: IgnoreRaise = ..., - ) -> DataFrame: ... + ) -> DataFrame: + ... @overload def drop( self, - labels: IndexLabel | ListLike = ..., + labels: IndexLabel = ..., *, axis: Axis = ..., - index: IndexLabel | ListLike = ..., - columns: IndexLabel | ListLike = ..., + index: IndexLabel = ..., + columns: IndexLabel = ..., level: Level = ..., inplace: bool = ..., errors: IgnoreRaise = ..., - ) -> DataFrame | None: ... + ) -> DataFrame | None: + ... def drop( self, - labels: IndexLabel | ListLike = None, + labels: IndexLabel | None = None, *, axis: Axis = 0, - index: IndexLabel | ListLike = None, - columns: IndexLabel | ListLike = None, + index: IndexLabel | None = None, + columns: IndexLabel | None = None, level: Level | None = None, inplace: bool = False, errors: IgnoreRaise = "raise", @@ -5293,16 +5459,16 @@ def drop( Parameters ---------- - labels : single label or iterable of labels + labels : single label or list-like Index or column labels to drop. A tuple will be used as a single - label and not treated as an iterable. + label and not treated as a list-like. axis : {0 or 'index', 1 or 'columns'}, default 0 Whether to drop labels from the index (0 or 'index') or columns (1 or 'columns'). - index : single label or iterable of labels + index : single label or list-like Alternative to specifying axis (``labels, axis=0`` is equivalent to ``index=labels``). - columns : single label or iterable of labels + columns : single label or list-like Alternative to specifying axis (``labels, axis=1`` is equivalent to ``columns=labels``). level : int or level name, optional @@ -5336,7 +5502,8 @@ def drop( Examples -------- - >>> df = pd.DataFrame(np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"]) + >>> df = pd.DataFrame(np.arange(12).reshape(3, 4), + ... columns=['A', 'B', 'C', 'D']) >>> df A B C D 0 0 1 2 3 @@ -5345,13 +5512,13 @@ def drop( Drop columns - >>> df.drop(["B", "C"], axis=1) + >>> df.drop(['B', 'C'], axis=1) A D 0 0 3 1 4 7 2 8 11 - >>> df.drop(columns=["B", "C"]) + >>> df.drop(columns=['B', 'C']) A D 0 0 3 1 4 7 @@ -5365,25 +5532,14 @@ def drop( Drop columns and/or rows of MultiIndex DataFrame - >>> midx = pd.MultiIndex( - ... levels=[["llama", "cow", "falcon"], ["speed", "weight", "length"]], - ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], - ... ) - >>> df = pd.DataFrame( - ... index=midx, - ... columns=["big", "small"], - ... data=[ - ... [45, 30], - ... [200, 100], - ... [1.5, 1], - ... [30, 20], - ... [250, 150], - ... [1.5, 0.8], - ... [320, 250], - ... [1, 0.8], - ... [0.3, 0.2], - ... ], - ... ) + >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], + ... ['speed', 'weight', 'length']], + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + >>> df = pd.DataFrame(index=midx, columns=['big', 'small'], + ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], + ... [250, 150], [1.5, 0.8], [320, 250], + ... 
[1, 0.8], [0.3, 0.2]]) >>> df big small llama speed 45.0 30.0 @@ -5400,7 +5556,7 @@ def drop( DataFrame, i.e., drop the combination ``'falcon'`` and ``'weight'``, which deletes only the corresponding row - >>> df.drop(index=("falcon", "weight")) + >>> df.drop(index=('falcon', 'weight')) big small llama speed 45.0 30.0 weight 200.0 100.0 @@ -5411,7 +5567,7 @@ def drop( falcon speed 320.0 250.0 length 0.3 0.2 - >>> df.drop(index="cow", columns="small") + >>> df.drop(index='cow', columns='small') big llama speed 45.0 weight 200.0 @@ -5420,7 +5576,7 @@ def drop( weight 1.0 length 0.3 - >>> df.drop(index="length", level=1) + >>> df.drop(index='length', level=1) big small llama speed 45.0 30.0 weight 200.0 100.0 @@ -5447,11 +5603,12 @@ def rename( index: Renamer | None = ..., columns: Renamer | None = ..., axis: Axis | None = ..., - copy: bool | lib.NoDefault = lib.no_default, + copy: bool | None = ..., inplace: Literal[True], level: Level = ..., errors: IgnoreRaise = ..., - ) -> None: ... + ) -> None: + ... @overload def rename( @@ -5461,11 +5618,12 @@ def rename( index: Renamer | None = ..., columns: Renamer | None = ..., axis: Axis | None = ..., - copy: bool | lib.NoDefault = lib.no_default, + copy: bool | None = ..., inplace: Literal[False] = ..., level: Level = ..., errors: IgnoreRaise = ..., - ) -> DataFrame: ... + ) -> DataFrame: + ... @overload def rename( @@ -5475,11 +5633,12 @@ def rename( index: Renamer | None = ..., columns: Renamer | None = ..., axis: Axis | None = ..., - copy: bool | lib.NoDefault = lib.no_default, + copy: bool | None = ..., inplace: bool = ..., level: Level = ..., errors: IgnoreRaise = ..., - ) -> DataFrame | None: ... + ) -> DataFrame | None: + ... def rename( self, @@ -5488,7 +5647,7 @@ def rename( index: Renamer | None = None, columns: Renamer | None = None, axis: Axis | None = None, - copy: bool | lib.NoDefault = lib.no_default, + copy: bool | None = None, inplace: bool = False, level: Level | None = None, errors: IgnoreRaise = "ignore", @@ -5518,7 +5677,7 @@ def rename( axis : {0 or 'index', 1 or 'columns'}, default 0 Axis to target with ``mapper``. Can be either the axis name ('index', 'columns') or number (0, 1). The default is 'index'. - copy : bool, default False + copy : bool, default True Also copy underlying data. .. note:: @@ -5532,8 +5691,6 @@ def rename( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` - - .. deprecated:: 3.0.0 inplace : bool, default False Whether to modify the DataFrame rather than creating a new one. If True then value of copy is ignored. @@ -5602,24 +5759,24 @@ def rename( Using axis-style parameters: - >>> df.rename(str.lower, axis="columns") + >>> df.rename(str.lower, axis='columns') a b 0 1 4 1 2 5 2 3 6 - >>> df.rename({1: 2, 2: 4}, axis="index") + >>> df.rename({1: 2, 2: 4}, axis='index') A B 0 1 4 2 2 5 4 3 6 """ - self._check_copy_deprecation(copy) return super()._rename( mapper=mapper, index=index, columns=columns, axis=axis, + copy=copy, inplace=inplace, level=level, errors=errors, @@ -5627,7 +5784,7 @@ def rename( def pop(self, item: Hashable) -> Series: """ - Return item and drop it from DataFrame. Raise KeyError if not found. + Return item and drop from frame. Raise KeyError if not found. Parameters ---------- @@ -5637,24 +5794,14 @@ def pop(self, item: Hashable) -> Series: Returns ------- Series - Series representing the item that is dropped. - - See Also - -------- - DataFrame.drop: Drop specified labels from rows or columns. 
- DataFrame.drop_duplicates: Return DataFrame with duplicate rows removed. Examples -------- - >>> df = pd.DataFrame( - ... [ - ... ("falcon", "bird", 389.0), - ... ("parrot", "bird", 24.0), - ... ("lion", "mammal", 80.5), - ... ("monkey", "mammal", np.nan), - ... ], - ... columns=("name", "class", "max_speed"), - ... ) + >>> df = pd.DataFrame([('falcon', 'bird', 389.0), + ... ('parrot', 'bird', 24.0), + ... ('lion', 'mammal', 80.5), + ... ('monkey', 'mammal', np.nan)], + ... columns=('name', 'class', 'max_speed')) >>> df name class max_speed 0 falcon bird 389.0 @@ -5662,7 +5809,7 @@ def pop(self, item: Hashable) -> Series: 2 lion mammal 80.5 3 monkey mammal NaN - >>> df.pop("class") + >>> df.pop('class') 0 bird 1 bird 2 mammal @@ -5678,19 +5825,9 @@ def pop(self, item: Hashable) -> Series: """ return super().pop(item=item) - @overload - def _replace_columnwise( - self, mapping: dict[Hashable, tuple[Any, Any]], inplace: Literal[True], regex - ) -> None: ... - - @overload - def _replace_columnwise( - self, mapping: dict[Hashable, tuple[Any, Any]], inplace: Literal[False], regex - ) -> Self: ... - def _replace_columnwise( self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex - ) -> Self | None: + ): """ Dispatch to Series.replace column-wise. @@ -5706,7 +5843,7 @@ def _replace_columnwise( DataFrame or None """ # Operate column-wise - res = self if inplace else self.copy(deep=False) + res = self if inplace else self.copy(deep=None) ax = self.columns for i, ax_value in enumerate(ax): @@ -5719,7 +5856,7 @@ def _replace_columnwise( res._iset_item(i, newobj, inplace=inplace) if inplace: - return None + return return res.__finalize__(self) @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) @@ -5733,11 +5870,16 @@ def shift( ) -> DataFrame: if freq is not None and fill_value is not lib.no_default: # GH#53832 - raise ValueError( - "Passing a 'freq' together with a 'fill_value' is not allowed." + warnings.warn( + "Passing a 'freq' together with a 'fill_value' silently ignores " + "the fill_value and is deprecated. This will raise in a future " + "version.", + FutureWarning, + stacklevel=find_stack_level(), ) + fill_value = lib.no_default - if self.empty and freq is None: + if self.empty: return self.copy() axis = self._get_axis_number(axis) @@ -5770,6 +5912,7 @@ def shift( periods = cast(int, periods) ncols = len(self.columns) + arrays = self._mgr.arrays if axis == 1 and periods != 0 and ncols > 0 and freq is None: if fill_value is lib.no_default: # We will infer fill_value to match the closest column @@ -5795,12 +5938,12 @@ def shift( result.columns = self.columns.copy() return result - elif len(self._mgr.blocks) > 1 or ( + elif len(arrays) > 1 or ( # If we only have one block and we know that we can't # keep the same dtype (i.e. the _can_hold_element check) # then we can go through the reindex_indexer path # (and avoid casting logic in the Block method). - not can_hold_element(self._mgr.blocks[0].values, fill_value) + not can_hold_element(arrays[0], fill_value) ): # GH#35488 we need to watch out for multi-block cases # We only get here with fill_value not-lib.no_default @@ -5839,7 +5982,8 @@ def set_index( append: bool = ..., inplace: Literal[False] = ..., verify_integrity: bool = ..., - ) -> DataFrame: ... + ) -> DataFrame: + ... @overload def set_index( @@ -5850,7 +5994,8 @@ def set_index( append: bool = ..., inplace: Literal[True], verify_integrity: bool = ..., - ) -> None: ... + ) -> None: + ... 
def set_index( self, @@ -5880,8 +6025,6 @@ def set_index( Delete columns to be used as the new index. append : bool, default False Whether to append columns to existing index. - Setting to True will add the new columns to existing index. - When set to False, the current index will be dropped from the DataFrame. inplace : bool, default False Whether to modify the DataFrame rather than creating a new one. verify_integrity : bool, default False @@ -5902,13 +6045,9 @@ def set_index( Examples -------- - >>> df = pd.DataFrame( - ... { - ... "month": [1, 4, 7, 10], - ... "year": [2012, 2014, 2013, 2014], - ... "sale": [55, 40, 84, 31], - ... } - ... ) + >>> df = pd.DataFrame({'month': [1, 4, 7, 10], + ... 'year': [2012, 2014, 2013, 2014], + ... 'sale': [55, 40, 84, 31]}) >>> df month year sale 0 1 2012 55 @@ -5918,7 +6057,7 @@ def set_index( Set the index to become the 'month' column: - >>> df.set_index("month") + >>> df.set_index('month') year sale month 1 2012 55 @@ -5928,7 +6067,7 @@ def set_index( Create a MultiIndex using columns 'year' and 'month': - >>> df.set_index(["year", "month"]) + >>> df.set_index(['year', 'month']) sale year month 2012 1 55 @@ -5938,7 +6077,7 @@ def set_index( Create a MultiIndex using an Index and a column: - >>> df.set_index([pd.Index([1, 2, 3, 4]), "year"]) + >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year']) month sale year 1 2012 1 55 @@ -5955,25 +6094,6 @@ def set_index( 2 4 4 2014 40 3 9 7 2013 84 4 16 10 2014 31 - - Append a column to the existing index: - - >>> df = df.set_index("month") - >>> df.set_index("year", append=True) - sale - month year - 1 2012 55 - 4 2014 40 - 7 2013 84 - 10 2014 31 - - >>> df.set_index("year", append=False) - sale - year - 2012 55 - 2014 40 - 2013 84 - 2014 31 """ inplace = validate_bool_kwarg(inplace, "inplace") self._check_inplace_and_allows_duplicate_labels(inplace) @@ -6011,7 +6131,8 @@ def set_index( if inplace: frame = self else: - frame = self.copy(deep=False) + # GH 49473 Use "lazy copy" with Copy-on-Write + frame = self.copy(deep=None) arrays: list[Index] = [] names: list[Hashable] = [] @@ -6024,7 +6145,7 @@ def set_index( else: arrays.append(self.index) - to_remove: set[Hashable] = set() + to_remove: list[Hashable] = [] for col in keys: if isinstance(col, MultiIndex): arrays.extend(col._get_level_values(n) for n in range(col.nlevels)) @@ -6051,7 +6172,7 @@ def set_index( arrays.append(frame[col]) names.append(col) if drop: - to_remove.add(col) + to_remove.append(col) if len(arrays[-1]) != len(self): # check newest element against length of calling frame, since @@ -6068,7 +6189,7 @@ def set_index( raise ValueError(f"Index has duplicate keys: {duplicates}") # use set to handle duplicate column names gracefully in case of drop - for c in to_remove: + for c in set(to_remove): del frame[c] # clear up memory usage @@ -6091,7 +6212,8 @@ def reset_index( col_fill: Hashable = ..., allow_duplicates: bool | lib.NoDefault = ..., names: Hashable | Sequence[Hashable] | None = None, - ) -> DataFrame: ... + ) -> DataFrame: + ... @overload def reset_index( @@ -6104,7 +6226,8 @@ def reset_index( col_fill: Hashable = ..., allow_duplicates: bool | lib.NoDefault = ..., names: Hashable | Sequence[Hashable] | None = None, - ) -> None: ... + ) -> None: + ... @overload def reset_index( @@ -6117,7 +6240,8 @@ def reset_index( col_fill: Hashable = ..., allow_duplicates: bool | lib.NoDefault = ..., names: Hashable | Sequence[Hashable] | None = None, - ) -> DataFrame | None: ... + ) -> DataFrame | None: + ... 
def reset_index( self, @@ -6161,8 +6285,8 @@ def reset_index( names : int, str or 1-dimensional list, default None Using the given string, rename the DataFrame column which contains the - index data. If the DataFrame has a MultiIndex, this has to be a list - with length equal to the number of levels. + index data. If the DataFrame has a MultiIndex, this has to be a list or + tuple with length equal to the number of levels. .. versionadded:: 1.5.0 @@ -6179,11 +6303,12 @@ def reset_index( Examples -------- - >>> df = pd.DataFrame( - ... [("bird", 389.0), ("bird", 24.0), ("mammal", 80.5), ("mammal", np.nan)], - ... index=["falcon", "parrot", "lion", "monkey"], - ... columns=("class", "max_speed"), - ... ) + >>> df = pd.DataFrame([('bird', 389.0), + ... ('bird', 24.0), + ... ('mammal', 80.5), + ... ('mammal', np.nan)], + ... index=['falcon', 'parrot', 'lion', 'monkey'], + ... columns=('class', 'max_speed')) >>> df class max_speed falcon bird 389.0 @@ -6213,21 +6338,19 @@ class max_speed You can also use `reset_index` with `MultiIndex`. - >>> index = pd.MultiIndex.from_tuples( - ... [ - ... ("bird", "falcon"), - ... ("bird", "parrot"), - ... ("mammal", "lion"), - ... ("mammal", "monkey"), - ... ], - ... names=["class", "name"], - ... ) - >>> columns = pd.MultiIndex.from_tuples([("speed", "max"), ("species", "type")]) - >>> df = pd.DataFrame( - ... [(389.0, "fly"), (24.0, "fly"), (80.5, "run"), (np.nan, "jump")], - ... index=index, - ... columns=columns, - ... ) + >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), + ... ('bird', 'parrot'), + ... ('mammal', 'lion'), + ... ('mammal', 'monkey')], + ... names=['class', 'name']) + >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'), + ... ('species', 'type')]) + >>> df = pd.DataFrame([(389.0, 'fly'), + ... (24.0, 'fly'), + ... (80.5, 'run'), + ... (np.nan, 'jump')], + ... index=index, + ... columns=columns) >>> df speed species max type @@ -6239,7 +6362,7 @@ class name Using the `names` parameter, choose a name for the index column: - >>> df.reset_index(names=["classes", "names"]) + >>> df.reset_index(names=['classes', 'names']) classes names speed species max type 0 bird falcon 389.0 fly @@ -6249,7 +6372,7 @@ class name If the index has multiple levels, we can reset a subset of them: - >>> df.reset_index(level="class") + >>> df.reset_index(level='class') class speed species max type name @@ -6261,7 +6384,7 @@ class speed species If we are not dropping the index, by default, it is placed in the top level. 
We can place it in another level: - >>> df.reset_index(level="class", col_level=1) + >>> df.reset_index(level='class', col_level=1) speed species class max type name @@ -6273,7 +6396,7 @@ class max type When the index is inserted under another level, we can specify under which one with the parameter `col_fill`: - >>> df.reset_index(level="class", col_level=1, col_fill="species") + >>> df.reset_index(level='class', col_level=1, col_fill='species') species speed species class max type name @@ -6284,7 +6407,7 @@ class max type If we specify a nonexistent level for `col_fill`, it is created: - >>> df.reset_index(level="class", col_level=1, col_fill="genus") + >>> df.reset_index(level='class', col_level=1, col_fill='genus') genus speed species class max type name @@ -6298,7 +6421,7 @@ class max type if inplace: new_obj = self else: - new_obj = self.copy(deep=False) + new_obj = self.copy(deep=None) if allow_duplicates is not lib.no_default: allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates") @@ -6317,13 +6440,12 @@ class max type names = self.index._get_default_index_names(names, default) if isinstance(self.index, MultiIndex): - to_insert = zip(reversed(self.index.levels), reversed(self.index.codes)) + to_insert = zip(self.index.levels, self.index.codes) else: to_insert = ((self.index, None),) multi_col = isinstance(self.columns, MultiIndex) - for j, (lev, lab) in enumerate(to_insert, start=1): - i = self.index.nlevels - j + for i, (lev, lab) in reversed(list(enumerate(to_insert))): if level is not None and i not in level: continue name = names[i] @@ -6404,7 +6526,8 @@ def dropna( subset: IndexLabel = ..., inplace: Literal[False] = ..., ignore_index: bool = ..., - ) -> DataFrame: ... + ) -> DataFrame: + ... @overload def dropna( @@ -6416,7 +6539,8 @@ def dropna( subset: IndexLabel = ..., inplace: Literal[True], ignore_index: bool = ..., - ) -> None: ... + ) -> None: + ... def dropna( self, @@ -6424,7 +6548,7 @@ def dropna( axis: Axis = 0, how: AnyAll | lib.NoDefault = lib.no_default, thresh: int | lib.NoDefault = lib.no_default, - subset: IndexLabel | AnyArrayLike | None = None, + subset: IndexLabel | None = None, inplace: bool = False, ignore_index: bool = False, ) -> DataFrame | None: @@ -6454,7 +6578,7 @@ def dropna( thresh : int, optional Require that many non-NA values. Cannot be combined with how. - subset : column label or iterable of labels, optional + subset : column label or sequence of labels, optional Labels along other axis to consider, e.g. if you are dropping rows these would be a list of columns to include. inplace : bool, default False @@ -6479,13 +6603,10 @@ def dropna( Examples -------- - >>> df = pd.DataFrame( - ... { - ... "name": ["Alfred", "Batman", "Catwoman"], - ... "toy": [np.nan, "Batmobile", "Bullwhip"], - ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT], - ... } - ... ) + >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], + ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], + ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), + ... pd.NaT]}) >>> df name toy born 0 Alfred NaN NaT @@ -6500,7 +6621,7 @@ def dropna( Drop the columns where at least one element is missing. - >>> df.dropna(axis="columns") + >>> df.dropna(axis='columns') name 0 Alfred 1 Batman @@ -6508,7 +6629,7 @@ def dropna( Drop the rows where all elements are missing. 
- >>> df.dropna(how="all") + >>> df.dropna(how='all') name toy born 0 Alfred NaN NaT 1 Batman Batmobile 1940-04-25 @@ -6523,11 +6644,11 @@ def dropna( Define in which columns to look for missing values. - >>> df.dropna(subset=["name", "toy"]) + >>> df.dropna(subset=['name', 'toy']) name toy born 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip NaT """ if (how is not lib.no_default) and (thresh is not lib.no_default): raise TypeError( "You cannot set both the how and thresh arguments at the same time." @@ -6548,7 +6690,7 @@ def dropna( if subset is not None: # subset needs to be list if not is_list_like(subset): - subset = [cast(Hashable, subset)] + subset = [subset] ax = self._get_axis(agg_axis) indices = ax.get_indexer_for(subset) check = indices == -1 @@ -6569,7 +6711,7 @@ def dropna( raise ValueError(f"invalid how option: {how}") if np.all(mask): - result = self.copy(deep=False) + result = self.copy(deep=None) else: result = self.loc(axis=axis)[mask] @@ -6584,36 +6726,39 @@ def dropna( @overload def drop_duplicates( self, - subset: Hashable | Iterable[Hashable] | None = ..., + subset: Hashable | Sequence[Hashable] | None = ..., *, keep: DropKeep = ..., inplace: Literal[True], ignore_index: bool = ..., - ) -> None: ... + ) -> None: + ... @overload def drop_duplicates( self, - subset: Hashable | Iterable[Hashable] | None = ..., + subset: Hashable | Sequence[Hashable] | None = ..., *, keep: DropKeep = ..., inplace: Literal[False] = ..., ignore_index: bool = ..., - ) -> DataFrame: ... + ) -> DataFrame: + ... @overload def drop_duplicates( self, - subset: Hashable | Iterable[Hashable] | None = ..., + subset: Hashable | Sequence[Hashable] | None = ..., *, keep: DropKeep = ..., inplace: bool = ..., ignore_index: bool = ..., - ) -> DataFrame | None: ... + ) -> DataFrame | None: + ... def drop_duplicates( self, - subset: Hashable | Iterable[Hashable] | None = None, + subset: Hashable | Sequence[Hashable] | None = None, *, keep: DropKeep = "first", inplace: bool = False, @@ -6627,7 +6772,7 @@ def drop_duplicates( Parameters ---------- - subset : column label or iterable of labels, optional + subset : column label or sequence of labels, optional Only consider certain columns for identifying duplicates, by default use all of the columns. keep : {'first', 'last', ``False``}, default 'first' @@ -6651,22 +6796,15 @@ def drop_duplicates( -------- DataFrame.value_counts: Count unique combinations of columns. - Notes - ----- - This method requires columns specified by ``subset`` to be of hashable type. - Passing unhashable columns will raise a ``TypeError``. - Examples -------- Consider dataset containing ramen rating. - >>> df = pd.DataFrame( - ... { - ... "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"], - ... "style": ["cup", "cup", "cup", "pack", "pack"], - ... "rating": [4, 4, 3.5, 15, 5], - ... } - ... ) + >>> df = pd.DataFrame({ + ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], + ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], + ... 
'rating': [4, 4, 3.5, 15, 5] + ... }) >>> df brand style rating 0 Yum Yum cup 4.0 @@ -6686,21 +6824,21 @@ def drop_duplicates( To remove duplicates on specific column(s), use ``subset``. - >>> df.drop_duplicates(subset=["brand"]) + >>> df.drop_duplicates(subset=['brand']) brand style rating 0 Yum Yum cup 4.0 2 Indomie cup 3.5 To remove duplicates and keep last occurrences, use ``keep``. - >>> df.drop_duplicates(subset=["brand", "style"], keep="last") + >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') brand style rating 1 Yum Yum cup 4.0 2 Indomie cup 3.5 4 Indomie pack 5.0 """ if self.empty: - return self.copy(deep=False) + return self.copy(deep=None) inplace = validate_bool_kwarg(inplace, "inplace") ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") @@ -6717,7 +6855,7 @@ def drop_duplicates( def duplicated( self, - subset: Hashable | Iterable[Hashable] | None = None, + subset: Hashable | Sequence[Hashable] | None = None, keep: DropKeep = "first", ) -> Series: """ @@ -6727,7 +6865,7 @@ def duplicated( Parameters ---------- - subset : column label or iterable of labels, optional + subset : column label or sequence of labels, optional Only consider certain columns for identifying duplicates, by default use all of the columns. keep : {'first', 'last', False}, default 'first' @@ -6753,13 +6891,11 @@ def duplicated( -------- Consider dataset containing ramen rating. - >>> df = pd.DataFrame( - ... { - ... "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"], - ... "style": ["cup", "cup", "cup", "pack", "pack"], - ... "rating": [4, 4, 3.5, 15, 5], - ... } - ... ) + >>> df = pd.DataFrame({ + ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], + ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], + ... 'rating': [4, 4, 3.5, 15, 5] + ... }) >>> df brand style rating 0 Yum Yum cup 4.0 @@ -6782,7 +6918,7 @@ def duplicated( By using 'last', the last occurrence of each set of duplicated values is set on False and all others on True. - >>> df.duplicated(keep="last") + >>> df.duplicated(keep='last') 0 True 1 False 2 False @@ -6802,7 +6938,7 @@ def duplicated( To find duplicates on specific column(s), use ``subset``. - >>> df.duplicated(subset=["brand"]) + >>> df.duplicated(subset=['brand']) 0 False 1 True 2 False @@ -6816,14 +6952,18 @@ def duplicated( def f(vals) -> tuple[np.ndarray, int]: labels, shape = algorithms.factorize(vals, size_hint=len(self)) - return labels.astype("i8"), len(shape) + return labels.astype("i8", copy=False), len(shape) if subset is None: - subset = self.columns + # https://github.com/pandas-dev/pandas/issues/28770 + # Incompatible types in assignment (expression has type "Index", variable + # has type "Sequence[Any]") + subset = self.columns # type: ignore[assignment] elif ( not np.iterable(subset) or isinstance(subset, str) - or (isinstance(subset, tuple) and subset in self.columns) + or isinstance(subset, tuple) + and subset in self.columns ): subset = (subset,) @@ -6839,7 +6979,7 @@ def f(vals) -> tuple[np.ndarray, int]: if len(subset) == 1 and self.columns.is_unique: # GH#45236 This is faster than get_group_index below - result = self[next(iter(subset))].duplicated(keep) + result = self[subset[0]].duplicated(keep) result.name = None else: vals = (col.values for name, col in self.items() if name in subset) @@ -6864,7 +7004,8 @@ def sort_values( na_position: NaPosition = ..., ignore_index: bool = ..., key: ValueKeyFunc = ..., - ) -> DataFrame: ... + ) -> DataFrame: + ... 
@overload def sort_values( @@ -6878,7 +7019,8 @@ def sort_values( na_position: str = ..., ignore_index: bool = ..., key: ValueKeyFunc = ..., - ) -> None: ... + ) -> None: + ... def sort_values( self, @@ -6928,8 +7070,7 @@ def sort_values( builtin :meth:`sorted` function, with the notable difference that this `key` function should be *vectorized*. It should expect a ``Series`` and return a Series with the same shape as the input. - It will be applied to each column in `by` independently. The values in the - returned Series will be used as the keys for sorting. + It will be applied to each column in `by` independently. Returns ------- @@ -6943,14 +7084,12 @@ def sort_values( Examples -------- - >>> df = pd.DataFrame( - ... { - ... "col1": ["A", "A", "B", np.nan, "D", "C"], - ... "col2": [2, 1, 9, 8, 7, 4], - ... "col3": [0, 1, 9, 4, 2, 3], - ... "col4": ["a", "B", "c", "D", "e", "F"], - ... } - ... ) + >>> df = pd.DataFrame({ + ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'], + ... 'col2': [2, 1, 9, 8, 7, 4], + ... 'col3': [0, 1, 9, 4, 2, 3], + ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F'] + ... }) >>> df col1 col2 col3 col4 0 A 2 0 a @@ -6960,11 +7099,9 @@ def sort_values( 4 D 7 2 e 5 C 4 3 F - **Sort by a single column** + Sort by col1 - In this case, we are sorting the rows according to values in ``col1``: - - >>> df.sort_values(by=["col1"]) + >>> df.sort_values(by=['col1']) col1 col2 col3 col4 0 A 2 0 a 1 A 1 1 B @@ -6973,14 +7110,9 @@ def sort_values( 4 D 7 2 e 3 NaN 8 4 D - **Sort by multiple columns** - - You can also provide multiple columns to ``by`` argument, as shown below. - In this example, the rows are first sorted according to ``col1``, and then - the rows that have an identical value in ``col1`` are sorted according - to ``col2``. + Sort by multiple columns - >>> df.sort_values(by=["col1", "col2"]) + >>> df.sort_values(by=['col1', 'col2']) col1 col2 col3 col4 1 A 1 1 B 0 A 2 0 a @@ -6989,11 +7121,9 @@ def sort_values( 4 D 7 2 e 3 NaN 8 4 D - **Sort in a descending order** + Sort Descending - The sort order can be reversed using ``ascending`` argument, as shown below: - - >>> df.sort_values(by="col1", ascending=False) + >>> df.sort_values(by='col1', ascending=False) col1 col2 col3 col4 4 D 7 2 e 5 C 4 3 F @@ -7002,13 +7132,9 @@ def sort_values( 1 A 1 1 B 3 NaN 8 4 D - **Placing any** ``NA`` **first** - - Note that in the above example, the rows that contain an ``NA`` value in their - ``col1`` are placed at the end of the dataframe. This behavior can be modified - via ``na_position`` argument, as shown below: + Putting NAs first - >>> df.sort_values(by="col1", ascending=False, na_position="first") + >>> df.sort_values(by='col1', ascending=False, na_position='first') col1 col2 col3 col4 3 NaN 8 4 D 4 D 7 2 e @@ -7017,14 +7143,9 @@ def sort_values( 0 A 2 0 a 1 A 1 1 B - **Customized sort order** + Sorting with a key function - The ``key`` argument allows for a further customization of sorting behaviour. - For example, you may want - to ignore the `letter's case `__ - when sorting strings: - - >>> df.sort_values(by="col4", key=lambda col: col.str.lower()) + >>> df.sort_values(by='col4', key=lambda col: col.str.lower()) col1 col2 col3 col4 0 A 2 0 a 1 A 1 1 B @@ -7033,19 +7154,13 @@ def sort_values( 4 D 7 2 e 5 C 4 3 F - Another typical example is - `natural sorting `__. - This can be done using - ``natsort`` `package `__, - which provides sorted indices according - to their natural order, as shown below: + Natural sort with the key argument, + using the `natsort ` package. 
- >>> df = pd.DataFrame( - ... { - ... "time": ["0hr", "128hr", "72hr", "48hr", "96hr"], - ... "value": [10, 20, 30, 40, 50], - ... } - ... ) + >>> df = pd.DataFrame({ + ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'], + ... "value": [10, 20, 30, 40, 50] + ... }) >>> df time value 0 0hr 10 @@ -7054,11 +7169,9 @@ def sort_values( 3 48hr 40 4 96hr 50 >>> from natsort import index_natsorted - >>> index_natsorted(df["time"]) - [0, 3, 2, 4, 1] >>> df.sort_values( ... by="time", - ... key=lambda x: np.argsort(index_natsorted(x)), + ... key=lambda x: np.argsort(index_natsorted(df["time"])) ... ) time value 0 0hr 10 @@ -7084,19 +7197,19 @@ def sort_values( f" != length of by ({len(by)})" ) if len(by) > 1: - keys = (self._get_label_or_level_values(x, axis=axis) for x in by) + keys = [self._get_label_or_level_values(x, axis=axis) for x in by] # need to rewrap columns in Series to apply key function if key is not None: - keys_data = [Series(k, name=name) for (k, name) in zip(keys, by)] - else: - # error: Argument 1 to "list" has incompatible type - # "Generator[ExtensionArray | ndarray[Any, Any], None, None]"; - # expected "Iterable[Series]" - keys_data = list(keys) # type: ignore[arg-type] + # error: List comprehension has incompatible type List[Series]; + # expected List[ndarray] + keys = [ + Series(k, name=name) # type: ignore[misc] + for (k, name) in zip(keys, by) + ] indexer = lexsort_indexer( - keys_data, orders=ascending, na_position=na_position, key=key + keys, orders=ascending, na_position=na_position, key=key ) elif len(by): # len(by) == 1 @@ -7119,10 +7232,10 @@ def sort_values( if inplace: return self._update_inplace(self) else: - return self.copy(deep=False) + return self.copy(deep=None) if is_range_indexer(indexer, len(indexer)): - result = self.copy(deep=False) + result = self.copy(deep=(not inplace and not using_copy_on_write())) if ignore_index: result.index = default_index(len(result)) @@ -7159,7 +7272,8 @@ def sort_index( sort_remaining: bool = ..., ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> None: ... + ) -> None: + ... @overload def sort_index( @@ -7174,7 +7288,8 @@ def sort_index( sort_remaining: bool = ..., ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> DataFrame: ... + ) -> DataFrame: + ... @overload def sort_index( @@ -7189,7 +7304,8 @@ def sort_index( sort_remaining: bool = ..., ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> DataFrame | None: ... + ) -> DataFrame | None: + ... def sort_index( self, @@ -7256,9 +7372,8 @@ def sort_index( Examples -------- - >>> df = pd.DataFrame( - ... [1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], columns=["A"] - ... ) + >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], + ... columns=['A']) >>> df.sort_index() A 1 4 @@ -7281,7 +7396,7 @@ def sort_index( A key function can be specified which is applied to the index before sorting. For a ``MultiIndex`` this is applied to each level separately. - >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=["A", "b", "C", "d"]) + >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd']) >>> df.sort_index(key=lambda x: x.str.lower()) a A 1 @@ -7310,7 +7425,7 @@ def value_counts( dropna: bool = True, ) -> Series: """ - Return a Series containing the frequency of each distinct row in the DataFrame. + Return a Series containing the frequency of each distinct row in the Dataframe. Parameters ---------- @@ -7319,22 +7434,17 @@ def value_counts( normalize : bool, default False Return proportions rather than frequencies. 
sort : bool, default True - Sort by frequencies when True. Preserve the order of the data when False. - - .. versionchanged:: 3.0.0 - - Prior to 3.0.0, ``sort=False`` would sort by the columns values. + Sort by frequencies when True. Sort by DataFrame column values when False. ascending : bool, default False Sort in ascending order. dropna : bool, default True - Do not include counts of rows that contain NA values. + Don't include counts of rows that contain NA values. .. versionadded:: 1.3.0 Returns ------- Series - Series containing the frequency of each distinct row in the DataFrame. See Also -------- @@ -7345,15 +7455,14 @@ def value_counts( The returned Series will have a MultiIndex with one level per input column but an Index (non-multi) for a single label. By default, rows that contain any NA values are omitted from the result. By default, - the resulting Series will be sorted by frequencies in descending order so that - the first element is the most frequently-occurring row. + the resulting Series will be in descending order so that the first + element is the most frequently-occurring row. Examples -------- - >>> df = pd.DataFrame( - ... {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, - ... index=["falcon", "dog", "cat", "ant"], - ... ) + >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6], + ... 'num_wings': [2, 0, 0, 0]}, + ... index=['falcon', 'dog', 'cat', 'ant']) >>> df num_legs num_wings falcon 2 2 @@ -7391,12 +7500,8 @@ def value_counts( With `dropna` set to `False` we can also count rows with NA values. - >>> df = pd.DataFrame( - ... { - ... "first_name": ["John", "Anne", "John", "Beth"], - ... "middle_name": ["Smith", pd.NA, pd.NA, "Louise"], - ... } - ... ) + >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'], + ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']}) >>> df first_name middle_name 0 John Smith @@ -7429,9 +7534,7 @@ def value_counts( subset = self.columns.tolist() name = "proportion" if normalize else "count" - counts = self.groupby( - subset, sort=False, dropna=dropna, observed=False - )._grouper.size() + counts = self.groupby(subset, dropna=dropna, observed=False)._grouper.size() counts.name = name if sort: @@ -7496,34 +7599,16 @@ def nlargest( Examples -------- - >>> df = pd.DataFrame( - ... { - ... "population": [ - ... 59000000, - ... 65000000, - ... 434000, - ... 434000, - ... 434000, - ... 337000, - ... 11300, - ... 11300, - ... 11300, - ... ], - ... "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311], - ... "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"], - ... }, - ... index=[ - ... "Italy", - ... "France", - ... "Malta", - ... "Maldives", - ... "Brunei", - ... "Iceland", - ... "Nauru", - ... "Tuvalu", - ... "Anguilla", - ... ], - ... ) + >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, + ... 434000, 434000, 337000, 11300, + ... 11300, 11300], + ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, + ... 17036, 182, 38, 311], + ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", + ... "IS", "NR", "TV", "AI"]}, + ... index=["Italy", "France", "Malta", + ... "Maldives", "Brunei", "Iceland", + ... "Nauru", "Tuvalu", "Anguilla"]) >>> df population GDP alpha-2 Italy 59000000 1937894 IT @@ -7539,7 +7624,7 @@ def nlargest( In the following example, we will use ``nlargest`` to select the three rows having the largest values in column "population". 
- >>> df.nlargest(3, "population") + >>> df.nlargest(3, 'population') population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT @@ -7547,7 +7632,7 @@ def nlargest( When using ``keep='last'``, ties are resolved in reverse order: - >>> df.nlargest(3, "population", keep="last") + >>> df.nlargest(3, 'population', keep='last') population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT @@ -7557,7 +7642,7 @@ def nlargest( if there are duplicate values for the smallest element, all the ties are kept: - >>> df.nlargest(3, "population", keep="all") + >>> df.nlargest(3, 'population', keep='all') population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT @@ -7567,7 +7652,7 @@ def nlargest( However, ``nlargest`` does not keep ``n`` distinct largest elements: - >>> df.nlargest(5, "population", keep="all") + >>> df.nlargest(5, 'population', keep='all') population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT @@ -7578,7 +7663,7 @@ def nlargest( To order by the largest values in column "population" and then "GDP", we can specify multiple columns like in the next example. - >>> df.nlargest(3, ["population", "GDP"]) + >>> df.nlargest(3, ['population', 'GDP']) population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT @@ -7617,7 +7702,6 @@ def nsmallest( Returns ------- DataFrame - DataFrame with the first `n` rows ordered by `columns` in ascending order. See Also -------- @@ -7628,34 +7712,16 @@ def nsmallest( Examples -------- - >>> df = pd.DataFrame( - ... { - ... "population": [ - ... 59000000, - ... 65000000, - ... 434000, - ... 434000, - ... 434000, - ... 337000, - ... 337000, - ... 11300, - ... 11300, - ... ], - ... "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311], - ... "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"], - ... }, - ... index=[ - ... "Italy", - ... "France", - ... "Malta", - ... "Maldives", - ... "Brunei", - ... "Iceland", - ... "Nauru", - ... "Tuvalu", - ... "Anguilla", - ... ], - ... ) + >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, + ... 434000, 434000, 337000, 337000, + ... 11300, 11300], + ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, + ... 17036, 182, 38, 311], + ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", + ... "IS", "NR", "TV", "AI"]}, + ... index=["Italy", "France", "Malta", + ... "Maldives", "Brunei", "Iceland", + ... "Nauru", "Tuvalu", "Anguilla"]) >>> df population GDP alpha-2 Italy 59000000 1937894 IT @@ -7671,7 +7737,7 @@ def nsmallest( In the following example, we will use ``nsmallest`` to select the three rows having the smallest values in column "population". - >>> df.nsmallest(3, "population") + >>> df.nsmallest(3, 'population') population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI @@ -7679,7 +7745,7 @@ def nsmallest( When using ``keep='last'``, ties are resolved in reverse order: - >>> df.nsmallest(3, "population", keep="last") + >>> df.nsmallest(3, 'population', keep='last') population GDP alpha-2 Anguilla 11300 311 AI Tuvalu 11300 38 TV @@ -7689,7 +7755,7 @@ def nsmallest( if there are duplicate values for the largest element, all the ties are kept. 
- >>> df.nsmallest(3, "population", keep="all") + >>> df.nsmallest(3, 'population', keep='all') population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI @@ -7699,7 +7765,7 @@ def nsmallest( However, ``nsmallest`` does not keep ``n`` distinct smallest elements: - >>> df.nsmallest(4, "population", keep="all") + >>> df.nsmallest(4, 'population', keep='all') population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI @@ -7709,7 +7775,7 @@ def nsmallest( To order by the smallest values in column "population" and then "GDP", we can specify multiple columns like in the next example. - >>> df.nsmallest(3, ["population", "GDP"]) + >>> df.nsmallest(3, ['population', 'GDP']) population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI @@ -7717,30 +7783,16 @@ def nsmallest( """ return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest() - def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: - """ - Swap levels i and j in a :class:`MultiIndex`. - - Default is to swap the two innermost levels of the index. - - Parameters - ---------- - i, j : int or str - Levels of the indices to be swapped. Can pass level name as string. - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to swap levels on. 0 or 'index' for row-wise, 1 or - 'columns' for column-wise. - - Returns - ------- - DataFrame - DataFrame with levels swapped in MultiIndex. - - See Also - -------- - DataFrame.reorder_levels: Reorder levels of MultiIndex. - DataFrame.sort_index: Sort MultiIndex. - + @doc( + Series.swaplevel, + klass=_shared_doc_kwargs["klass"], + extra_params=dedent( + """axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to swap levels on. 0 or 'index' for row-wise, 1 or + 'columns' for column-wise.""" + ), + examples=dedent( + """\ Examples -------- >>> df = pd.DataFrame( @@ -7790,9 +7842,11 @@ def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: History Final exam January A Geography Final exam February B History Coursework March A - Geography Coursework April C - """ - result = self.copy(deep=False) + Geography Coursework April C""" + ), + ) + def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: + result = self.copy(deep=None) axis = self._get_axis_number(axis) @@ -7809,9 +7863,7 @@ def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFrame: """ - Rearrange index or column levels using input ``order``. - - May not drop or duplicate levels. + Rearrange index levels using input order. May not drop or duplicate levels. Parameters ---------- @@ -7824,13 +7876,8 @@ def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFram Returns ------- DataFrame - DataFrame with indices or columns with reordered levels. - See Also - -------- - DataFrame.swaplevel : Swap levels i and j in a MultiIndex. - - Examples + Examples -------- >>> data = { ... 
"class": ["Mammals", "Mammals", "Reptiles"], @@ -7859,7 +7906,7 @@ class diet if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover raise TypeError("Can only reorder levels on a hierarchical axis.") - result = self.copy(deep=False) + result = self.copy(deep=None) if axis == 0: assert isinstance(result.index, MultiIndex) @@ -7935,7 +7982,13 @@ def _dispatch_frame_op( # TODO operate_blockwise expects a manager of the same type bm = self._mgr.operate_blockwise( - right._mgr, + # error: Argument 1 to "operate_blockwise" of "ArrayManager" has + # incompatible type "Union[ArrayManager, BlockManager]"; expected + # "ArrayManager" + # error: Argument 1 to "operate_blockwise" of "BlockManager" has + # incompatible type "Union[ArrayManager, BlockManager]"; expected + # "BlockManager" + right._mgr, # type: ignore[arg-type] array_op, ) return self._constructor_from_mgr(bm, axes=bm.axes) @@ -8003,27 +8056,19 @@ def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame: left = self # GH#31623, only operate on shared columns - cols, lcol_indexer, rcol_indexer = left.columns.join( - right.columns, how="inner", return_indexers=True + cols, lcols, rcols = left.columns.join( + right.columns, how="inner", level=None, return_indexers=True ) - new_left = left if lcol_indexer is None else left.iloc[:, lcol_indexer] - new_right = right if rcol_indexer is None else right.iloc[:, rcol_indexer] - - # GH#60498 For MultiIndex column alignment - if isinstance(cols, MultiIndex): - # When overwriting column names, make a shallow copy so as to not modify - # the input DFs - new_left = new_left.copy(deep=False) - new_right = new_right.copy(deep=False) - new_left.columns = cols - new_right.columns = cols - + new_left = left.iloc[:, lcols] + new_right = right.iloc[:, rcols] result = op(new_left, new_right) # Do the join on the columns instead of using left._align_for_op # to avoid constructing two potentially large/sparse DataFrames - join_columns = left.columns.join(right.columns, how="outer") + join_columns, _, _ = left.columns.join( + right.columns, how="outer", level=None, return_indexers=True + ) if result.columns.has_duplicates: # Avoid reindexing with a duplicate axis. @@ -8049,18 +8094,6 @@ def _should_reindex_frame_op(self, right, op, axis: int, fill_value, level) -> b if not isinstance(right, DataFrame): return False - if ( - ( - isinstance(self.columns, MultiIndex) - or isinstance(right.columns, MultiIndex) - ) - and not self.columns.equals(right.columns) - and fill_value is None - ): - # GH#60498 Reindex if MultiIndexe columns are not matching - # GH#60903 Don't reindex if fill_value is provided - return True - if fill_value is None and level is None and axis == 1: # TODO: any other cases we should handle here? @@ -8184,7 +8217,9 @@ def to_series(right): if flex is not None and isinstance(right, DataFrame): if not left._indexed_same(right): if flex: - left, right = left.align(right, join="outer", level=level) + left, right = left.align( + right, join="outer", level=level, copy=False + ) else: raise ValueError( "Can only compare identically-labeled (both index and columns) " @@ -8197,7 +8232,7 @@ def to_series(right): if not left.axes[axis].equals(right.index): raise ValueError( "Operands are not aligned. Do " - "`left, right = left.align(right, axis=1)` " + "`left, right = left.align(right, axis=1, copy=False)` " "before operating." 
) @@ -8206,6 +8241,7 @@ def to_series(right): join="outer", axis=axis, level=level, + copy=False, ) right = left._maybe_align_series_as_frame(right, axis) @@ -8632,8 +8668,8 @@ def combine( -------- Combine using a simple function that chooses the smaller column. - >>> df1 = pd.DataFrame({"A": [0, 0], "B": [4, 4]}) - >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) + >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) + >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 >>> df1.combine(df2, take_smaller) A B @@ -8642,8 +8678,8 @@ def combine( Example using a true element-wise combine function. - >>> df1 = pd.DataFrame({"A": [5, 0], "B": [2, 4]}) - >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) + >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]}) + >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> df1.combine(df2, np.minimum) A B 0 1 2 @@ -8652,8 +8688,8 @@ def combine( Using `fill_value` fills Nones prior to passing the column to the merge function. - >>> df1 = pd.DataFrame({"A": [0, 0], "B": [None, 4]}) - >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) + >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) + >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> df1.combine(df2, take_smaller, fill_value=-5) A B 0 0 -5.0 @@ -8662,8 +8698,8 @@ def combine( However, if the same element in both dataframes is None, that None is preserved - >>> df1 = pd.DataFrame({"A": [0, 0], "B": [None, 4]}) - >>> df2 = pd.DataFrame({"A": [1, 1], "B": [None, 3]}) + >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) + >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]}) >>> df1.combine(df2, take_smaller, fill_value=-5) A B 0 0 -5.0 @@ -8672,14 +8708,8 @@ def combine( Example that demonstrates the use of `overwrite` and behavior when the axis differ between the dataframes. - >>> df1 = pd.DataFrame({"A": [0, 0], "B": [4, 4]}) - >>> df2 = pd.DataFrame( - ... { - ... "B": [3, 3], - ... "C": [-10, 1], - ... }, - ... index=[1, 2], - ... ) + >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) + >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2]) >>> df1.combine(df2, take_smaller) A B C 0 NaN NaN NaN @@ -8694,13 +8724,7 @@ def combine( Demonstrating the preference of the passed in dataframe. - >>> df2 = pd.DataFrame( - ... { - ... "B": [3, 3], - ... "C": [1, 1], - ... }, - ... index=[1, 2], - ... ) + >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2]) >>> df2.combine(df1, take_smaller) A B C 0 0.0 NaN NaN @@ -8714,9 +8738,8 @@ def combine( 2 NaN 3.0 1.0 """ other_idxlen = len(other.index) # save for compare - other_columns = other.columns - this, other = self.align(other) + this, other = self.align(other, copy=False) new_index = this.index if other.empty and len(new_index) == len(self.index): @@ -8725,8 +8748,8 @@ def combine( if self.empty and len(other) == other_idxlen: return other.copy() - # preserve column order - new_columns = self.columns.union(other_columns, sort=False) + # sorts if possible; otherwise align above ensures that these are set-equal + new_columns = this.columns.union(other.columns) do_fill = fill_value is not None result = {} for col in new_columns: @@ -8756,15 +8779,15 @@ def combine( # try to promote series, which is all NaN, as other_dtype. new_dtype = other_dtype try: - series = series.astype(new_dtype) + series = series.astype(new_dtype, copy=False) except ValueError: # e.g. 
new_dtype is integer types pass else: # if we have different dtypes, possibly promote new_dtype = find_common_type([this_dtype, other_dtype]) - series = series.astype(new_dtype) - other_series = other_series.astype(new_dtype) + series = series.astype(new_dtype, copy=False) + other_series = other_series.astype(new_dtype, copy=False) arr = func(series, other_series) if isinstance(new_dtype, np.dtype): @@ -8811,8 +8834,8 @@ def combine_first(self, other: DataFrame) -> DataFrame: Examples -------- - >>> df1 = pd.DataFrame({"A": [None, 0], "B": [None, 4]}) - >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) + >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]}) + >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> df1.combine_first(df2) A B 0 1.0 3.0 @@ -8821,8 +8844,8 @@ def combine_first(self, other: DataFrame) -> DataFrame: Null values still persist if the location of that null value does not exist in `other` - >>> df1 = pd.DataFrame({"A": [None, 0], "B": [4, None]}) - >>> df2 = pd.DataFrame({"B": [3, 3], "C": [1, 1]}, index=[1, 2]) + >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]}) + >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2]) >>> df1.combine_first(df2) A B C 0 NaN 4.0 NaN @@ -8919,14 +8942,12 @@ def update( dict.update : Similar method for dictionaries. DataFrame.merge : For column(s)-on-column(s) operations. - Notes - ----- - 1. Duplicate indices on `other` are not supported and raises `ValueError`. - Examples -------- - >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [400, 500, 600]}) - >>> new_df = pd.DataFrame({"B": [4, 5, 6], "C": [7, 8, 9]}) + >>> df = pd.DataFrame({'A': [1, 2, 3], + ... 'B': [400, 500, 600]}) + >>> new_df = pd.DataFrame({'B': [4, 5, 6], + ... 'C': [7, 8, 9]}) >>> df.update(new_df) >>> df A B @@ -8937,8 +8958,9 @@ def update( The DataFrame's length does not increase as a result of the update, only values at matching index/column labels are updated. - >>> df = pd.DataFrame({"A": ["a", "b", "c"], "B": ["x", "y", "z"]}) - >>> new_df = pd.DataFrame({"B": ["d", "e", "f", "g", "h", "i"]}) + >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], + ... 'B': ['x', 'y', 'z']}) + >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']}) >>> df.update(new_df) >>> df A B @@ -8946,8 +8968,9 @@ def update( 1 b e 2 c f - >>> df = pd.DataFrame({"A": ["a", "b", "c"], "B": ["x", "y", "z"]}) - >>> new_df = pd.DataFrame({"B": ["d", "f"]}, index=[0, 2]) + >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], + ... 'B': ['x', 'y', 'z']}) + >>> new_df = pd.DataFrame({'B': ['d', 'f']}, index=[0, 2]) >>> df.update(new_df) >>> df A B @@ -8957,8 +8980,9 @@ def update( For Series, its name attribute must be set. - >>> df = pd.DataFrame({"A": ["a", "b", "c"], "B": ["x", "y", "z"]}) - >>> new_column = pd.Series(["d", "e", "f"], name="B") + >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], + ... 'B': ['x', 'y', 'z']}) + >>> new_column = pd.Series(['d', 'e', 'f'], name='B') >>> df.update(new_column) >>> df A B @@ -8969,8 +8993,9 @@ def update( If `other` contains NaNs the corresponding values are not updated in the original dataframe. - >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [400.0, 500.0, 600.0]}) - >>> new_df = pd.DataFrame({"B": [4, np.nan, 6]}) + >>> df = pd.DataFrame({'A': [1, 2, 3], + ... 
'B': [400., 500., 600.]}) + >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]}) >>> df.update(new_df) >>> df A B @@ -8978,13 +9003,21 @@ def update( 1 2 500.0 2 3 6.0 """ - if not PYPY: + + if not PYPY and using_copy_on_write(): if sys.getrefcount(self) <= REF_COUNT: warnings.warn( _chained_assignment_method_msg, ChainedAssignmentError, stacklevel=2, ) + elif not PYPY and not using_copy_on_write() and self._is_view_after_cow_rules(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) # TODO: Support other joins if join != "left": # pragma: no cover @@ -8995,22 +9028,11 @@ def update( if not isinstance(other, DataFrame): other = DataFrame(other) - if other.index.has_duplicates: - raise ValueError("Update not allowed with duplicate indexes on other.") - - index_intersection = other.index.intersection(self.index) - if index_intersection.empty: - raise ValueError( - "Update not allowed when the index on `other` has no intersection " - "with this dataframe." - ) - - other = other.reindex(index_intersection) - this_data = self.loc[index_intersection] + other = other.reindex(self.index) for col in self.columns.intersection(other.columns): - this = this_data[col] - that = other[col] + this = self[col]._values + that = other[col]._values if filter_func is not None: mask = ~filter_func(this) | isna(that) @@ -9030,7 +9052,17 @@ def update( if mask.all(): continue - self.loc[index_intersection, col] = this.where(mask, that) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="Downcasting behavior", + category=FutureWarning, + ) + # GH#57124 - `that` might get upcasted because of NA values, and then + # downcasted in where because of the mask. Ignoring the warning + # is a stopgap, will replace with a new implementation of update + # in 3.0. + self.loc[:, col] = self[col].where(mask, that) # ---------------------------------------------------------------------- # Data reshaping @@ -9085,8 +9117,8 @@ def update( We can also choose to include NA in group keys or not by setting `dropna` parameter, the default setting is `True`. - >>> arr = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] - >>> df = pd.DataFrame(arr, columns=["a", "b", "c"]) + >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] + >>> df = pd.DataFrame(l, columns=["a", "b", "c"]) >>> df.groupby(by=["b"]).sum() a c @@ -9101,8 +9133,8 @@ def update( 2.0 2 5 NaN 1 4 - >>> arr = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]] - >>> df = pd.DataFrame(arr, columns=["a", "b", "c"]) + >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]] + >>> df = pd.DataFrame(l, columns=["a", "b", "c"]) >>> df.groupby(by="a").sum() b c @@ -9144,13 +9176,33 @@ def update( def groupby( self, by=None, + axis: Axis | lib.NoDefault = lib.no_default, level: IndexLabel | None = None, as_index: bool = True, sort: bool = True, group_keys: bool = True, - observed: bool = True, + observed: bool | lib.NoDefault = lib.no_default, dropna: bool = True, ) -> DataFrameGroupBy: + if axis is not lib.no_default: + axis = self._get_axis_number(axis) + if axis == 1: + warnings.warn( + "DataFrame.groupby with axis=1 is deprecated. 
Do " + "`frame.T.groupby(...)` without axis instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + warnings.warn( + "The 'axis' keyword in DataFrame.groupby is deprecated and " + "will be removed in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + axis = 0 + from pandas.core.groupby.generic import DataFrameGroupBy if level is None and by is None: @@ -9159,6 +9211,7 @@ def groupby( return DataFrameGroupBy( obj=self, keys=by, + axis=axis, level=level, as_index=as_index, sort=sort, @@ -9167,7 +9220,9 @@ def groupby( dropna=dropna, ) - _shared_docs["pivot"] = """ + _shared_docs[ + "pivot" + ] = """ Return reshaped DataFrame organized by given index / column values. Reshape data (produce a "pivot" table) based on column values. Uses @@ -9252,11 +9307,11 @@ def groupby( You could also assign a list of column names or a list of index names. >>> df = pd.DataFrame({ - ... "lev1": [1, 1, 1, 2, 2, 2], - ... "lev2": [1, 1, 2, 1, 1, 2], - ... "lev3": [1, 2, 1, 2, 1, 2], - ... "lev4": [1, 2, 3, 4, 5, 6], - ... "values": [0, 1, 2, 3, 4, 5]}) + ... "lev1": [1, 1, 1, 2, 2, 2], + ... "lev2": [1, 1, 2, 1, 1, 2], + ... "lev3": [1, 2, 1, 2, 1, 2], + ... "lev4": [1, 2, 3, 4, 5, 6], + ... "values": [0, 1, 2, 3, 4, 5]}) >>> df lev1 lev2 lev3 lev4 values 0 1 1 1 1 0 @@ -9311,7 +9366,9 @@ def pivot( return pivot(self, index=index, columns=columns, values=values) - _shared_docs["pivot_table"] = """ + _shared_docs[ + "pivot_table" + ] = """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects @@ -9357,20 +9414,16 @@ def pivot( If True: only show observed values for categorical groupers. If False: show all values for categorical groupers. - .. versionchanged:: 3.0.0 + .. deprecated:: 2.2.0 - The default value is now ``True``. + The default value of ``False`` is deprecated and will change to + ``True`` in a future version of pandas. sort : bool, default True Specifies if the result should be sorted. .. versionadded:: 1.3.0 - **kwargs : dict - Optional keyword arguments to pass to ``aggfunc``. - - .. versionadded:: 3.0.0 - Returns ------- DataFrame @@ -9476,9 +9529,8 @@ def pivot_table( margins: bool = False, dropna: bool = True, margins_name: Level = "All", - observed: bool = True, + observed: bool | lib.NoDefault = lib.no_default, sort: bool = True, - **kwargs, ) -> DataFrame: from pandas.core.reshape.pivot import pivot_table @@ -9494,7 +9546,6 @@ def pivot_table( margins_name=margins_name, observed=observed, sort=sort, - **kwargs, ) def stack( @@ -9502,7 +9553,7 @@ def stack( level: IndexLabel = -1, dropna: bool | lib.NoDefault = lib.no_default, sort: bool | lib.NoDefault = lib.no_default, - future_stack: bool = True, + future_stack: bool = False, ): """ Stack the prescribed level(s) from columns to index. @@ -9512,9 +9563,10 @@ def stack( DataFrame. The new inner-most levels are created by pivoting the columns of the current dataframe: - - if the columns have a single level, the output is a Series; - - if the columns have multiple levels, the new index level(s) is (are) - taken from the prescribed level(s) and the output is a DataFrame. + - if the columns have a single level, the output is a Series; + - if the columns have multiple levels, the new index + level(s) is (are) taken from the prescribed level(s) and + the output is a DataFrame. Parameters ---------- @@ -9530,7 +9582,7 @@ def stack( section. sort : bool, default True Whether to sort the levels of the resulting MultiIndex. 
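
For reference, a minimal sketch of the behavior toggled by the ``future_stack`` flag in the surrounding ``stack`` hunk, assuming a pandas 2.1+ install where both code paths are still available:

import pandas as pd

cols = pd.MultiIndex.from_tuples([("weight", "kg"), ("height", "m")])
df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], index=["cat", "dog"], columns=cols)

# Legacy implementation: dropna/sort are honored and, for multi-level columns,
# a FutureWarning about the upcoming change may be emitted.
legacy = df.stack(future_stack=False)

# New implementation: no all-NA rows are introduced and dropna/sort must stay unset.
new = df.stack(future_stack=True)
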
- future_stack : bool, default True + future_stack : bool, default False Whether to use the new implementation that will replace the current implementation in pandas 3.0. When True, dropna and sort have no impact on the result and must remain unspecified. See :ref:`pandas 2.1.0 Release @@ -9564,9 +9616,9 @@ def stack( -------- **Single level columns** - >>> df_single_level_cols = pd.DataFrame( - ... [[0, 1], [2, 3]], index=["cat", "dog"], columns=["weight", "height"] - ... ) + >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]], + ... index=['cat', 'dog'], + ... columns=['weight', 'height']) Stacking a dataframe with a single level column axis returns a Series: @@ -9574,7 +9626,7 @@ def stack( weight height cat 0 1 dog 2 3 - >>> df_single_level_cols.stack() + >>> df_single_level_cols.stack(future_stack=True) cat weight 0 height 1 dog weight 2 @@ -9583,12 +9635,11 @@ def stack( **Multi level columns: simple case** - >>> multicol1 = pd.MultiIndex.from_tuples( - ... [("weight", "kg"), ("weight", "pounds")] - ... ) - >>> df_multi_level_cols1 = pd.DataFrame( - ... [[1, 2], [2, 4]], index=["cat", "dog"], columns=multicol1 - ... ) + >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), + ... ('weight', 'pounds')]) + >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]], + ... index=['cat', 'dog'], + ... columns=multicol1) Stacking a dataframe with a multi-level column axis: @@ -9597,7 +9648,7 @@ def stack( kg pounds cat 1 2 dog 2 4 - >>> df_multi_level_cols1.stack() + >>> df_multi_level_cols1.stack(future_stack=True) weight cat kg 1 pounds 2 @@ -9606,10 +9657,11 @@ def stack( **Missing values** - >>> multicol2 = pd.MultiIndex.from_tuples([("weight", "kg"), ("height", "m")]) - >>> df_multi_level_cols2 = pd.DataFrame( - ... [[1.0, 2.0], [3.0, 4.0]], index=["cat", "dog"], columns=multicol2 - ... ) + >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), + ... ('height', 'm')]) + >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], + ... index=['cat', 'dog'], + ... columns=multicol2) It is common to have missing values when stacking a dataframe with multi-level columns, as the stacked dataframe typically @@ -9621,7 +9673,7 @@ def stack( kg m cat 1.0 2.0 dog 3.0 4.0 - >>> df_multi_level_cols2.stack() + >>> df_multi_level_cols2.stack(future_stack=True) weight height cat kg 1.0 NaN m NaN 2.0 @@ -9632,13 +9684,13 @@ def stack( The first parameter controls which level or levels are stacked: - >>> df_multi_level_cols2.stack(0) + >>> df_multi_level_cols2.stack(0, future_stack=True) kg m cat weight 1.0 NaN height NaN 2.0 dog weight 3.0 NaN height NaN 4.0 - >>> df_multi_level_cols2.stack([0, 1]) + >>> df_multi_level_cols2.stack([0, 1], future_stack=True) cat weight kg 1.0 height m 2.0 dog weight kg 3.0 @@ -9651,14 +9703,19 @@ def stack( stack_multiple, ) - warnings.warn( - "The previous implementation of stack is deprecated and will be " - "removed in a future version of pandas. See the What's New notes " - "for pandas 2.1.0 for details. Do not specify the future_stack " - "argument to adopt the new implementation and silence this warning.", - FutureWarning, - stacklevel=find_stack_level(), - ) + if ( + dropna is not lib.no_default + or sort is not lib.no_default + or self.columns.nlevels > 1 + ): + warnings.warn( + "The previous implementation of stack is deprecated and will be " + "removed in a future version of pandas. See the What's New notes " + "for pandas 2.1.0 for details. 
Specify future_stack=True to adopt " + "the new implementation and silence this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) if dropna is lib.no_default: dropna = True @@ -9674,14 +9731,14 @@ def stack( if dropna is not lib.no_default: raise ValueError( - "dropna must be unspecified as the new " + "dropna must be unspecified with future_stack=True as the new " "implementation does not introduce rows of NA values. This " "argument will be removed in a future version of pandas." ) if sort is not lib.no_default: raise ValueError( - "Cannot specify sort, this argument will be " + "Cannot specify sort with future_stack=True, this argument will be " "removed in a future version of pandas. Sort the result using " ".sort_index instead." ) @@ -9758,13 +9815,9 @@ def explode( Examples -------- - >>> df = pd.DataFrame( - ... { - ... "A": [[0, 1, 2], "foo", [], [3, 4]], - ... "B": 1, - ... "C": [["a", "b", "c"], np.nan, [], ["d", "e"]], - ... } - ... ) + >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]], + ... 'B': 1, + ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) >>> df A B C 0 [0, 1, 2] 1 [a, b, c] @@ -9774,7 +9827,7 @@ def explode( Single-column explode. - >>> df.explode("A") + >>> df.explode('A') A B C 0 0 1 [a, b, c] 0 1 1 [a, b, c] @@ -9786,7 +9839,7 @@ def explode( Multi-column explode. - >>> df.explode(list("AC")) + >>> df.explode(list('AC')) A B C 0 0 1 a 0 1 1 b @@ -9831,13 +9884,11 @@ def explode( result.index = default_index(len(result)) else: result.index = self.index.take(result.index) - result = result.reindex(columns=self.columns) + result = result.reindex(columns=self.columns, copy=False) return result.__finalize__(self, method="explode") - def unstack( - self, level: IndexLabel = -1, fill_value=None, sort: bool = True - ) -> DataFrame | Series: + def unstack(self, level: IndexLabel = -1, fill_value=None, sort: bool = True): """ Pivot a level of the (necessarily hierarchical) index labels. @@ -9859,8 +9910,6 @@ def unstack( Returns ------- Series or DataFrame - If index is a MultiIndex: DataFrame with pivoted index labels as new - inner-most level column labels, else Series. See Also -------- @@ -9874,9 +9923,8 @@ def unstack( Examples -------- - >>> index = pd.MultiIndex.from_tuples( - ... [("one", "a"), ("one", "b"), ("two", "a"), ("two", "b")] - ... ) + >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), + ... ('two', 'a'), ('two', 'b')]) >>> s = pd.Series(np.arange(1.0, 5.0), index=index) >>> s one a 1.0 @@ -9909,6 +9957,7 @@ def unstack( return result.__finalize__(self, method="unstack") + @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"}) def melt( self, id_vars=None, @@ -9918,127 +9967,6 @@ def melt( col_level: Level | None = None, ignore_index: bool = True, ) -> DataFrame: - """ - Unpivot DataFrame from wide to long format, optionally leaving identifiers set. - - This function is useful to massage a DataFrame into a format where one - or more columns are identifier variables (`id_vars`), while all other - columns, considered measured variables (`value_vars`), are "unpivoted" to - the row axis, leaving just two non-identifier columns, 'variable' and - 'value'. - - Parameters - ---------- - id_vars : scalar, tuple, list, or ndarray, optional - Column(s) to use as identifier variables. - value_vars : scalar, tuple, list, or ndarray, optional - Column(s) to unpivot. If not specified, uses all columns that - are not set as `id_vars`. - var_name : scalar, default None - Name to use for the 'variable' column. 
If None it uses - ``frame.columns.name`` or 'variable'. - value_name : scalar, default 'value' - Name to use for the 'value' column, can't be an existing column label. - col_level : scalar, optional - If columns are a MultiIndex then use this level to melt. - ignore_index : bool, default True - If True, original index is ignored. If False, original index is retained. - Index labels will be repeated as necessary. - - Returns - ------- - DataFrame - Unpivoted DataFrame. - - See Also - -------- - melt : Identical method. - pivot_table : Create a spreadsheet-style pivot table as a DataFrame. - DataFrame.pivot : Return reshaped DataFrame organized - by given index / column values. - DataFrame.explode : Explode a DataFrame from list-like - columns to long format. - - Notes - ----- - Reference :ref:`the user guide ` for more examples. - - Examples - -------- - >>> df = pd.DataFrame( - ... { - ... "A": {0: "a", 1: "b", 2: "c"}, - ... "B": {0: 1, 1: 3, 2: 5}, - ... "C": {0: 2, 1: 4, 2: 6}, - ... } - ... ) - >>> df - A B C - 0 a 1 2 - 1 b 3 4 - 2 c 5 6 - - >>> df.melt(id_vars=["A"], value_vars=["B"]) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - - >>> df.melt(id_vars=["A"], value_vars=["B", "C"]) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - 3 a C 2 - 4 b C 4 - 5 c C 6 - - The names of 'variable' and 'value' columns can be customized: - - >>> df.melt( - ... id_vars=["A"], - ... value_vars=["B"], - ... var_name="myVarname", - ... value_name="myValname", - ... ) - A myVarname myValname - 0 a B 1 - 1 b B 3 - 2 c B 5 - - Original index values can be kept around: - - >>> df.melt(id_vars=["A"], value_vars=["B", "C"], ignore_index=False) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - 0 a C 2 - 1 b C 4 - 2 c C 6 - - If you have multi-index columns: - - >>> df.columns = [list("ABC"), list("DEF")] - >>> df - A B C - D E F - 0 a 1 2 - 1 b 3 4 - 2 c 5 6 - - >>> df.melt(col_level=0, id_vars=["A"], value_vars=["B"]) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - - >>> df.melt(id_vars=[("A", "D")], value_vars=[("B", "E")]) - (A, D) variable_0 variable_1 value - 0 a B E 1 - 1 b B E 3 - 2 c B E 5 - """ return melt( self, id_vars=id_vars, @@ -10179,11 +10107,11 @@ def _gotitem( -------- DataFrame.apply : Perform any type of operations. DataFrame.transform : Perform transformation type operations. - DataFrame.groupby : Perform operations over groups. - DataFrame.resample : Perform operations over resampled bins. - DataFrame.rolling : Perform operations over rolling window. - DataFrame.expanding : Perform operations over expanding window. - core.window.ewm.ExponentialMovingWindow : Perform operation over exponential + pandas.DataFrame.groupby : Perform operations over groups. + pandas.DataFrame.resample : Perform operations over resampled bins. + pandas.DataFrame.rolling : Perform operations over rolling window. + pandas.DataFrame.expanding : Perform operations over expanding window. + pandas.core.window.ewm.ExponentialMovingWindow : Perform operation over exponential weighted window. """ ) @@ -10275,7 +10203,7 @@ def apply( result_type: Literal["expand", "reduce", "broadcast"] | None = None, args=(), by_row: Literal[False, "compat"] = "compat", - engine: Callable | None | Literal["python", "numba"] = None, + engine: Literal["python", "numba"] = "python", engine_kwargs: dict[str, bool] | None = None, **kwargs, ): @@ -10286,9 +10214,7 @@ def apply( either the DataFrame's index (``axis=0``) or the DataFrame's columns (``axis=1``). 
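
A short, hedged sketch of the ``engine`` parameter documented in the surrounding ``apply`` hunk; the numba line assumes numba is installed and is therefore left commented out:

import numpy as np
import pandas as pd

df = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"])

# Default engine ("python"): func receives one Series per column (axis=0) or row (axis=1).
col_sums = df.apply(np.sum, axis=0)   # A 12, B 27
row_sums = df.apply(np.sum, axis=1)   # 13, 13, 13

# With the numba engine the passed function is JIT compiled; raw=True hands func
# plain ndarrays, which is what that engine expects.
# fast_row_sums = df.apply(np.sum, axis=1, raw=True, engine="numba")
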
By default (``result_type=None``), the final return type is inferred from the return type of the applied function. Otherwise, - it depends on the `result_type` argument. The return type of the applied - function is inferred based on the first computed result obtained after - applying the function to a Series object. + it depends on the `result_type` argument. Parameters ---------- @@ -10339,24 +10265,28 @@ def apply( .. versionadded:: 2.1.0 - engine : decorator or {'python', 'numba'}, optional - Choose the execution engine to use. If not provided the function - will be executed by the regular Python interpreter. + engine : {'python', 'numba'}, default 'python' + Choose between the python (default) engine or the numba engine in apply. + + The numba engine will attempt to JIT compile the passed function, + which may result in speedups for large DataFrames. + It also supports the following engine_kwargs : - Other options include JIT compilers such Numba and Bodo, which in some - cases can speed up the execution. To use an executor you can provide - the decorators ``numba.jit``, ``numba.njit`` or ``bodo.jit``. You can - also provide the decorator with parameters, like ``numba.jit(nogit=True)``. + - nopython (compile the function in nopython mode) + - nogil (release the GIL inside the JIT compiled function) + - parallel (try to apply the function in parallel over the DataFrame) - Not all functions can be executed with all execution engines. In general, - JIT compilers will require type stability in the function (no variable - should change data type during the execution). And not all pandas and - NumPy APIs are supported. Check the engine documentation [1]_ and [2]_ - for limitations. + Note: Due to limitations within numba/how pandas interfaces with numba, + you should only use this if raw=True - .. warning:: + Note: The numba compiler only supports a subset of + valid Python/numpy operations. - String parameters will stop being supported in a future pandas version. + Please read more about the `supported python features + `_ + and `supported numpy features + `_ + in numba to learn what you can or cannot use in the passed function. .. versionadded:: 2.2.0 @@ -10364,7 +10294,6 @@ def apply( Pass keyword arguments to the engine. This is currently only used by the numba engine, see the documentation for the engine argument for more information. - **kwargs Additional keyword arguments to pass as keywords arguments to `func`. @@ -10387,16 +10316,9 @@ def apply( behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` for more details. - References - ---------- - .. [1] `Numba documentation - `_ - .. [2] `Bodo documentation - `/ - Examples -------- - >>> df = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"]) + >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B']) >>> df A B 0 4 9 @@ -10436,7 +10358,7 @@ def apply( Passing ``result_type='expand'`` will expand list-like results to columns of a Dataframe - >>> df.apply(lambda x: [1, 2], axis=1, result_type="expand") + >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand') 0 1 0 1 2 1 1 2 @@ -10446,7 +10368,7 @@ def apply( ``result_type='expand'``. The resulting column names will be the Series index. - >>> df.apply(lambda x: pd.Series([1, 2], index=["foo", "bar"]), axis=1) + >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1) foo bar 0 1 2 1 1 2 @@ -10457,107 +10379,30 @@ def apply( and broadcast it along the axis. The resulting column names will be the originals. 
- >>> df.apply(lambda x: [1, 2], axis=1, result_type="broadcast") + >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast') A B 0 1 2 1 1 2 2 1 2 - - Advanced users can speed up their code by using a Just-in-time (JIT) compiler - with ``apply``. The main JIT compilers available for pandas are Numba and Bodo. - In general, JIT compilation is only possible when the function passed to - ``apply`` has type stability (variables in the function do not change their - type during the execution). - - >>> import bodo - >>> df.apply(lambda x: x.A + x.B, axis=1, engine=bodo.jit) - - Note that JIT compilation is only recommended for functions that take a - significant amount of time to run. Fast functions are unlikely to run faster - with JIT compilation. """ - if engine is None or isinstance(engine, str): - from pandas.core.apply import frame_apply - - if engine is None: - engine = "python" - - if engine not in ["python", "numba"]: - raise ValueError(f"Unknown engine '{engine}'") - - op = frame_apply( - self, - func=func, - axis=axis, - raw=raw, - result_type=result_type, - by_row=by_row, - engine=engine, - engine_kwargs=engine_kwargs, - args=args, - kwargs=kwargs, - ) - return op.apply().__finalize__(self, method="apply") - elif hasattr(engine, "__pandas_udf__"): - if result_type is not None: - raise NotImplementedError( - f"{result_type=} only implemented for the default engine" - ) - - agg_axis = self._get_agg_axis(self._get_axis_number(axis)) + from pandas.core.apply import frame_apply - # one axis is empty - if not all(self.shape): - func = cast(Callable, func) - try: - if axis == 0: - r = func(Series([], dtype=np.float64), *args, **kwargs) - else: - r = func( - Series(index=self.columns, dtype=np.float64), - *args, - **kwargs, - ) - except Exception: - pass - else: - if not isinstance(r, Series): - if len(agg_axis): - r = func(Series([], dtype=np.float64), *args, **kwargs) - else: - r = np.nan - - return self._constructor_sliced(r, index=agg_axis) - return self.copy() - - data: DataFrame | np.ndarray = self - if raw: - # This will upcast the whole DataFrame to the same type, - # and likely result in an object 2D array. - # We should probably pass a list of 1D arrays instead, at - # lest for ``axis=0`` - data = self.values - result = engine.__pandas_udf__.apply( - data=data, - func=func, - args=args, - kwargs=kwargs, - decorator=engine, - axis=axis, - ) - if raw: - if result.ndim == 2: - return self._constructor( - result, index=self.index, columns=self.columns - ) - else: - return self._constructor_sliced(result, index=agg_axis) - return result - else: - raise ValueError(f"Unknown engine {engine}") + op = frame_apply( + self, + func=func, + axis=axis, + raw=raw, + result_type=result_type, + by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, + ) + return op.apply().__finalize__(self, method="apply") def map( - self, func: PythonFuncType, na_action: Literal["ignore"] | None = None, **kwargs + self, func: PythonFuncType, na_action: str | None = None, **kwargs ) -> DataFrame: """ Apply a function to a Dataframe elementwise. @@ -10607,7 +10452,7 @@ def map( >>> df_copy = df.copy() >>> df_copy.iloc[0, 0] = pd.NA - >>> df_copy.map(lambda x: len(str(x)), na_action="ignore") + >>> df_copy.map(lambda x: len(str(x)), na_action='ignore') 0 1 0 NaN 4 1 5.0 5 @@ -10630,13 +10475,15 @@ def map( But it's better to avoid map in that case. 
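
A small sketch of the elementwise API referenced in the surrounding ``map`` hunk, assuming pandas 2.1+ where ``DataFrame.applymap`` is only a deprecated alias for ``DataFrame.map``:

import pandas as pd

df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])

# Elementwise transform; na_action="ignore" would propagate NA values without
# calling the function on them.
lengths = df.map(lambda x: len(str(x)))

# Deprecated spelling of the same call; emits a FutureWarning and forwards to map.
# lengths_legacy = df.applymap(lambda x: len(str(x)))
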
- >>> df**2 + >>> df ** 2 0 1 0 1.000000 4.494400 1 11.262736 20.857489 """ if na_action not in {"ignore", None}: - raise ValueError(f"na_action must be 'ignore' or None. Got {na_action!r}") + raise ValueError( + f"na_action must be 'ignore' or None. Got {repr(na_action)}" + ) if self.empty: return self.copy() @@ -10648,6 +10495,60 @@ def infer(x): return self.apply(infer).__finalize__(self, "map") + def applymap( + self, func: PythonFuncType, na_action: NaAction | None = None, **kwargs + ) -> DataFrame: + """ + Apply a function to a Dataframe elementwise. + + .. deprecated:: 2.1.0 + + DataFrame.applymap has been deprecated. Use DataFrame.map instead. + + This method applies a function that accepts and returns a scalar + to every element of a DataFrame. + + Parameters + ---------- + func : callable + Python function, returns a single value from a single value. + na_action : {None, 'ignore'}, default None + If 'ignore', propagate NaN values, without passing them to func. + **kwargs + Additional keyword arguments to pass as keywords arguments to + `func`. + + Returns + ------- + DataFrame + Transformed DataFrame. + + See Also + -------- + DataFrame.apply : Apply a function along input axis of DataFrame. + DataFrame.map : Apply a function along input axis of DataFrame. + DataFrame.replace: Replace values given in `to_replace` with `value`. + + Examples + -------- + >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) + >>> df + 0 1 + 0 1.000 2.120 + 1 3.356 4.567 + + >>> df.map(lambda x: len(str(x))) + 0 1 + 0 3 4 + 1 5 5 + """ + warnings.warn( + "DataFrame.applymap has been deprecated. Use DataFrame.map instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.map(func, na_action=na_action, **kwargs) + # ---------------------------------------------------------------------- # Merging / joining methods @@ -10671,16 +10572,16 @@ def _append( index = Index( [other.name], - name=( - self.index.names - if isinstance(self.index, MultiIndex) - else self.index.name - ), + name=self.index.names + if isinstance(self.index, MultiIndex) + else self.index.name, ) row_df = other.to_frame().T # infer_objects is needed for # test_append_empty_frame_to_series_with_dateutil_tz - other = row_df.infer_objects().rename_axis(index.names) + other = row_df.infer_objects(copy=False).rename_axis( + index.names, copy=False + ) elif isinstance(other, list): if not other: pass @@ -10733,8 +10634,7 @@ def join( values given, the `other` DataFrame must have a MultiIndex. Can pass an array as the join key if it is not already contained in the calling DataFrame. Like an Excel VLOOKUP operation. - how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'}, - default 'left' + how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left' How to handle the operation of the two objects. * left: use calling frame's index (or column if on is specified) @@ -10746,10 +10646,6 @@ def join( of the calling's one. * cross: creates the cartesian product from both frames, preserves the order of the left keys. - * left_anti: use set difference of calling frame's index and `other`'s - index. - * right_anti: use set difference of `other`'s index and calling frame's - index. lsuffix : str, default '' Suffix to use from left frame's overlapping columns. rsuffix : str, default '' @@ -10784,12 +10680,8 @@ def join( Examples -------- - >>> df = pd.DataFrame( - ... { - ... "key": ["K0", "K1", "K2", "K3", "K4", "K5"], - ... "A": ["A0", "A1", "A2", "A3", "A4", "A5"], - ... } - ... 
) + >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], + ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) >>> df key A @@ -10800,7 +10692,8 @@ def join( 4 K4 A4 5 K5 A5 - >>> other = pd.DataFrame({"key": ["K0", "K1", "K2"], "B": ["B0", "B1", "B2"]}) + >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], + ... 'B': ['B0', 'B1', 'B2']}) >>> other key B @@ -10810,7 +10703,7 @@ def join( Join DataFrames using their indexes. - >>> df.join(other, lsuffix="_caller", rsuffix="_other") + >>> df.join(other, lsuffix='_caller', rsuffix='_other') key_caller A key_other B 0 K0 A0 K0 B0 1 K1 A1 K1 B1 @@ -10823,7 +10716,7 @@ def join( the index in both `df` and `other`. The joined DataFrame will have key as its index. - >>> df.set_index("key").join(other.set_index("key")) + >>> df.set_index('key').join(other.set_index('key')) A B key K0 A0 B0 @@ -10838,7 +10731,7 @@ def join( any column in `df`. This method preserves the original DataFrame's index in the result. - >>> df.join(other.set_index("key"), on="key") + >>> df.join(other.set_index('key'), on='key') key A B 0 K0 A0 B0 1 K1 A1 B1 @@ -10849,12 +10742,8 @@ def join( Using non-unique key values shows how they are matched. - >>> df = pd.DataFrame( - ... { - ... "key": ["K0", "K1", "K1", "K3", "K0", "K1"], - ... "A": ["A0", "A1", "A2", "A3", "A4", "A5"], - ... } - ... ) + >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'], + ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) >>> df key A @@ -10865,7 +10754,7 @@ def join( 4 K0 A4 5 K1 A5 - >>> df.join(other.set_index("key"), on="key", validate="m:1") + >>> df.join(other.set_index('key'), on='key', validate='m:1') key A B 0 K0 A0 B0 1 K1 A1 B1 @@ -10929,7 +10818,7 @@ def join( res = concat( frames, axis=1, join="outer", verify_integrity=True, sort=sort ) - return res.reindex(self.index) + return res.reindex(self.index, copy=False) else: return concat( frames, axis=1, join=how, verify_integrity=True, sort=sort @@ -10962,12 +10851,10 @@ def merge( right_index: bool = False, sort: bool = False, suffixes: Suffixes = ("_x", "_y"), - copy: bool | lib.NoDefault = lib.no_default, + copy: bool | None = None, indicator: str | bool = False, validate: MergeValidate | None = None, ) -> DataFrame: - self._check_copy_deprecation(copy) - from pandas.core.reshape.merge import merge return merge( @@ -10981,6 +10868,7 @@ def merge( right_index=right_index, sort=sort, suffixes=suffixes, + copy=copy, indicator=indicator, validate=validate, ) @@ -10989,7 +10877,7 @@ def round( self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs ) -> DataFrame: """ - Round numeric columns in a DataFrame to a variable number of decimal places. + Round a DataFrame to a variable number of decimal places. Parameters ---------- @@ -11020,18 +10908,10 @@ def round( numpy.around : Round a numpy array to the given number of decimals. Series.round : Round a Series to the given number of decimals. - Notes - ----- - For values exactly halfway between rounded decimal values, pandas rounds - to the nearest even value (e.g. -0.5 and 0.5 round to 0.0, 1.5 and 2.5 - round to 2.0, etc.). - Examples -------- - >>> df = pd.DataFrame( - ... [(0.21, 0.32), (0.01, 0.67), (0.66, 0.03), (0.21, 0.18)], - ... columns=["dogs", "cats"], - ... ) + >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], + ... 
columns=['dogs', 'cats']) >>> df dogs cats 0 0.21 0.32 @@ -11053,7 +10933,7 @@ def round( specified with the column names as key and the number of decimal places as value - >>> df.round({"dogs": 1, "cats": 0}) + >>> df.round({'dogs': 1, 'cats': 0}) dogs cats 0 0.2 0.0 1 0.0 1.0 @@ -11064,7 +10944,7 @@ def round( specified with the column names as index and the number of decimal places as value - >>> decimals = pd.Series([0, 1], index=["cats", "dogs"]) + >>> decimals = pd.Series([0, 1], index=['cats', 'dogs']) >>> df.round(decimals) dogs cats 0 0.2 0.0 @@ -11074,7 +10954,7 @@ def round( """ from pandas.core.reshape.concat import concat - def _dict_round(df: DataFrame, decimals) -> Iterator[Series]: + def _dict_round(df: DataFrame, decimals): for col, vals in df.items(): try: yield _series_round(vals, decimals[col]) @@ -11102,6 +10982,7 @@ def _series_round(ser: Series, decimals: int) -> Series: # type "Union[int, integer[Any]]"; expected "int" new_mgr = self._mgr.round( decimals=decimals, # type: ignore[arg-type] + using_cow=using_copy_on_write(), ) return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__( self, method="round" @@ -11176,18 +11057,15 @@ def corr( >>> def histogram_intersection(a, b): ... v = np.minimum(a, b).sum().round(decimals=1) ... return v - >>> df = pd.DataFrame( - ... [(0.2, 0.3), (0.0, 0.6), (0.6, 0.0), (0.2, 0.1)], - ... columns=["dogs", "cats"], - ... ) + >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], + ... columns=['dogs', 'cats']) >>> df.corr(method=histogram_intersection) dogs cats dogs 1.0 0.3 cats 0.3 1.0 - >>> df = pd.DataFrame( - ... [(1, 1), (2, np.nan), (np.nan, 3), (4, 4)], columns=["dogs", "cats"] - ... ) + >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)], + ... columns=['dogs', 'cats']) >>> df.corr(min_periods=3) dogs cats dogs 1.0 NaN @@ -11313,18 +11191,16 @@ def cov( Examples -------- - >>> df = pd.DataFrame( - ... [(1, 2), (0, 3), (2, 0), (1, 1)], columns=["dogs", "cats"] - ... ) + >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], + ... columns=['dogs', 'cats']) >>> df.cov() dogs cats dogs 0.666667 -1.000000 cats -1.000000 1.666667 >>> np.random.seed(42) - >>> df = pd.DataFrame( - ... np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"] - ... ) + >>> df = pd.DataFrame(np.random.randn(1000, 5), + ... columns=['a', 'b', 'c', 'd', 'e']) >>> df.cov() a b c d e a 0.998438 -0.020161 0.059277 -0.008943 0.014144 @@ -11340,9 +11216,10 @@ def cov( each column pair in order to have a valid result: >>> np.random.seed(42) - >>> df = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"]) - >>> df.loc[df.index[:5], "a"] = np.nan - >>> df.loc[df.index[5:10], "b"] = np.nan + >>> df = pd.DataFrame(np.random.randn(20, 3), + ... columns=['a', 'b', 'c']) + >>> df.loc[df.index[:5], 'a'] = np.nan + >>> df.loc[df.index[5:10], 'b'] = np.nan >>> df.cov(min_periods=12) a b c a 0.316741 NaN -0.150812 @@ -11374,7 +11251,6 @@ def corrwith( drop: bool = False, method: CorrelationMethod = "pearson", numeric_only: bool = False, - min_periods: int | None = None, ) -> Series: """ Compute pairwise correlation. @@ -11405,9 +11281,6 @@ def corrwith( numeric_only : bool, default False Include only `float`, `int` or `boolean` data. - min_periods : int, optional - Minimum number of observations needed to have a valid result. - .. versionadded:: 1.5.0 .. 
versionchanged:: 2.0.0 @@ -11426,12 +11299,8 @@ def corrwith( -------- >>> index = ["a", "b", "c", "d", "e"] >>> columns = ["one", "two", "three", "four"] - >>> df1 = pd.DataFrame( - ... np.arange(20).reshape(5, 4), index=index, columns=columns - ... ) - >>> df2 = pd.DataFrame( - ... np.arange(16).reshape(4, 4), index=index[:4], columns=columns - ... ) + >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns) + >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns) >>> df1.corrwith(df2) one 1.0 two 1.0 @@ -11446,19 +11315,16 @@ def corrwith( d 1.0 e NaN dtype: float64 - """ + """ # noqa: E501 axis = self._get_axis_number(axis) this = self._get_numeric_data() if numeric_only else self if isinstance(other, Series): - return this.apply( - lambda x: other.corr(x, method=method, min_periods=min_periods), - axis=axis, - ) + return this.apply(lambda x: other.corr(x, method=method), axis=axis) if numeric_only: other = other._get_numeric_data() - left, right = this.align(other, join="inner") + left, right = this.align(other, join="inner", copy=False) if axis == 1: left = left.T @@ -11517,7 +11383,7 @@ def c(x): # ---------------------------------------------------------------------- # ndarray-like stats methods - def count(self, axis: Axis = 0, numeric_only: bool = False) -> Series: + def count(self, axis: Axis = 0, numeric_only: bool = False): """ Count non-NA cells for each column or row. @@ -11549,13 +11415,10 @@ def count(self, axis: Axis = 0, numeric_only: bool = False) -> Series: -------- Constructing DataFrame from a dictionary: - >>> df = pd.DataFrame( - ... { - ... "Person": ["John", "Myla", "Lewis", "John", "Myla"], - ... "Age": [24.0, np.nan, 21.0, 33, 26], - ... "Single": [False, True, True, True, False], - ... } - ... ) + >>> df = pd.DataFrame({"Person": + ... ["John", "Myla", "Lewis", "John", "Myla"], + ... "Age": [24., np.nan, 21., 33, 26], + ... 
"Single": [False, True, True, True, False]}) >>> df Person Age Single 0 John 24.0 False @@ -11574,7 +11437,7 @@ def count(self, axis: Axis = 0, numeric_only: bool = False) -> Series: Counts for each **row**: - >>> df.count(axis="columns") + >>> df.count(axis='columns') 0 3 1 2 2 3 @@ -11595,7 +11458,7 @@ def count(self, axis: Axis = 0, numeric_only: bool = False) -> Series: else: result = notna(frame).sum(axis=axis) - return result.astype("int64").__finalize__(self, method="count") + return result.astype("int64", copy=False).__finalize__(self, method="count") def _reduce( self, @@ -11618,11 +11481,30 @@ def func(values: np.ndarray): # We only use this in the case that operates on self.values return op(values, axis=axis, skipna=skipna, **kwds) + dtype_has_keepdims: dict[ExtensionDtype, bool] = {} + def blk_func(values, axis: Axis = 1): if isinstance(values, ExtensionArray): - if not is_1d_only_ea_dtype(values.dtype): + if not is_1d_only_ea_dtype(values.dtype) and not isinstance( + self._mgr, ArrayManager + ): return values._reduce(name, axis=1, skipna=skipna, **kwds) - return values._reduce(name, skipna=skipna, keepdims=True, **kwds) + has_keepdims = dtype_has_keepdims.get(values.dtype) + if has_keepdims is None: + sign = signature(values._reduce) + has_keepdims = "keepdims" in sign.parameters + dtype_has_keepdims[values.dtype] = has_keepdims + if has_keepdims: + return values._reduce(name, skipna=skipna, keepdims=True, **kwds) + else: + warnings.warn( + f"{type(values)}._reduce will require a `keepdims` parameter " + "in the future", + FutureWarning, + stacklevel=find_stack_level(), + ) + result = values._reduce(name, skipna=skipna, **kwds) + return np.array([result]) else: return op(values, axis=axis, skipna=skipna, **kwds) @@ -11640,9 +11522,9 @@ def _get_data() -> DataFrame: if numeric_only: df = _get_data() if axis is None: - dtype = find_common_type([block.values.dtype for block in df._mgr.blocks]) + dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) if isinstance(dtype, ExtensionDtype): - df = df.astype(dtype) + df = df.astype(dtype, copy=False) arr = concat_compat(list(df._iter_column_arrays())) return arr._reduce(name, skipna=skipna, keepdims=False, **kwds) return func(df.values) @@ -11665,9 +11547,7 @@ def _get_data() -> DataFrame: # kurtosis excluded since groupby does not implement it if df.shape[1] and name != "kurt": - dtype = find_common_type( - [block.values.dtype for block in df._mgr.blocks] - ) + dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) if isinstance(dtype, ExtensionDtype): # GH 54341: fastpath for EA-backed axis=1 reductions # This flattens the frame into a single 1D array while keeping @@ -11676,7 +11556,7 @@ def _get_data() -> DataFrame: # be equivalent to transposing the original frame and aggregating # with axis=0. 
name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name) - df = df.astype(dtype) + df = df.astype(dtype, copy=False) arr = concat_compat(list(df._iter_column_arrays())) nrows, ncols = df.shape row_index = np.tile(np.arange(nrows), ncols) @@ -11741,46 +11621,16 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: else: raise NotImplementedError(name) - for blocks in self._mgr.blocks: - middle = func(blocks.values, axis=0, skipna=skipna) + for arr in self._mgr.arrays: + middle = func(arr, axis=0, skipna=skipna) result = ufunc(result, middle) res_ser = self._constructor_sliced(result, index=self.index, copy=False) return res_ser + @doc(make_doc("any", ndim=2)) # error: Signature of "any" incompatible with supertype "NDFrame" - @overload # type: ignore[override] - def any( - self, - *, - axis: Axis = ..., - bool_only: bool = ..., - skipna: bool = ..., - **kwargs, - ) -> Series: ... - - @overload - def any( - self, - *, - axis: None, - bool_only: bool = ..., - skipna: bool = ..., - **kwargs, - ) -> bool: ... - - @overload - def any( - self, - *, - axis: Axis | None, - bool_only: bool = ..., - skipna: bool = ..., - **kwargs, - ) -> Series | bool: ... - - @doc(make_doc("any", ndim=1)) - def any( + def any( # type: ignore[override] self, *, axis: Axis | None = 0, @@ -11795,38 +11645,7 @@ def any( result = result.__finalize__(self, method="any") return result - @overload - def all( - self, - *, - axis: Axis = ..., - bool_only: bool = ..., - skipna: bool = ..., - **kwargs, - ) -> Series: ... - - @overload - def all( - self, - *, - axis: None, - bool_only: bool = ..., - skipna: bool = ..., - **kwargs, - ) -> bool: ... - - @overload - def all( - self, - *, - axis: Axis | None, - bool_only: bool = ..., - skipna: bool = ..., - **kwargs, - ) -> Series | bool: ... - - @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="all") - @doc(make_doc("all", ndim=1)) + @doc(make_doc("all", ndim=2)) def all( self, axis: Axis | None = 0, @@ -11841,101 +11660,33 @@ def all( result = result.__finalize__(self, method="all") return result - # error: Signature of "min" incompatible with supertype "NDFrame" - @overload # type: ignore[override] + @doc(make_doc("min", ndim=2)) def min( self, - *, - axis: Axis = ..., - skipna: bool = ..., - numeric_only: bool = ..., + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, **kwargs, - ) -> Series: ... + ): + result = super().min(axis, skipna, numeric_only, **kwargs) + if isinstance(result, Series): + result = result.__finalize__(self, method="min") + return result - @overload - def min( + @doc(make_doc("max", ndim=2)) + def max( self, - *, - axis: None, - skipna: bool = ..., - numeric_only: bool = ..., + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, **kwargs, - ) -> Any: ... - - @overload - def min( - self, - *, - axis: Axis | None, - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series | Any: ... 
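
One side of the surrounding hunks wraps the reductions in ``@deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"])``; a hedged sketch of the call style that stays valid either way:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [2, 3]}, index=["tiger", "zebra"])

# Passing axis/skipna/numeric_only by keyword works on every pandas version and
# sidesteps the positional-argument deprecation targeted at pandas 4.0.
col_min = df.min(axis=0)
row_max = df.max(axis=1, skipna=True, numeric_only=False)
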
- - @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="min") - @doc(make_doc("min", ndim=2)) - def min( - self, - axis: Axis | None = 0, - skipna: bool = True, - numeric_only: bool = False, - **kwargs, - ) -> Series | Any: - result = super().min( - axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs - ) - if isinstance(result, Series): - result = result.__finalize__(self, method="min") - return result - - # error: Signature of "max" incompatible with supertype "NDFrame" - @overload # type: ignore[override] - def max( - self, - *, - axis: Axis = ..., - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series: ... - - @overload - def max( - self, - *, - axis: None, - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Any: ... - - @overload - def max( - self, - *, - axis: Axis | None, - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series | Any: ... - - @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="max") - @doc(make_doc("max", ndim=2)) - def max( - self, - axis: Axis | None = 0, - skipna: bool = True, - numeric_only: bool = False, - **kwargs, - ) -> Series | Any: - result = super().max( - axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs - ) + ): + result = super().max(axis, skipna, numeric_only, **kwargs) if isinstance(result, Series): result = result.__finalize__(self, method="max") return result - @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sum") + @doc(make_doc("sum", ndim=2)) def sum( self, axis: Axis | None = 0, @@ -11943,100 +11694,11 @@ def sum( numeric_only: bool = False, min_count: int = 0, **kwargs, - ) -> Series: - """ - Return the sum of the values over the requested axis. - - This is equivalent to the method ``numpy.sum``. - - Parameters - ---------- - axis : {index (0), columns (1)} - Axis for the function to be applied on. - For `Series` this parameter is unused and defaults to 0. - - .. warning:: - - The behavior of DataFrame.sum with ``axis=None`` is deprecated, - in a future version this will reduce over both axes and return a scalar - To retain the old behavior, pass axis=0 (or do not pass axis). - - .. versionadded:: 2.0.0 - - skipna : bool, default True - Exclude NA/null values when computing the result. - numeric_only : bool, default False - Include only float, int, boolean columns. Not implemented for Series. - min_count : int, default 0 - The required number of valid values to perform the operation. If fewer than - ``min_count`` non-NA values are present the result will be NA. - **kwargs - Additional keyword arguments to be passed to the function. - - Returns - ------- - Series or scalar - Sum over requested axis. - - See Also - -------- - Series.sum : Return the sum over Series values. - DataFrame.mean : Return the mean of the values over the requested axis. - DataFrame.median : Return the median of the values over the requested axis. - DataFrame.mode : Get the mode(s) of each element along the requested axis. - DataFrame.std : Return the standard deviation of the values over the - requested axis. - - Examples - -------- - >>> idx = pd.MultiIndex.from_arrays( - ... [["warm", "warm", "cold", "cold"], ["dog", "falcon", "fish", "spider"]], - ... names=["blooded", "animal"], - ... 
) - >>> s = pd.Series([4, 2, 0, 8], name="legs", index=idx) - >>> s - blooded animal - warm dog 4 - falcon 2 - cold fish 0 - spider 8 - Name: legs, dtype: int64 - - >>> s.sum() - 14 - - By default, the sum of an empty or all-NA Series is ``0``. - - >>> pd.Series([], dtype="float64").sum() # min_count=0 is the default - 0.0 - - This can be controlled with the ``min_count`` parameter. For example, if - you'd like the sum of an empty series to be NaN, pass ``min_count=1``. - - >>> pd.Series([], dtype="float64").sum(min_count=1) - nan - - Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and - empty series identically. - - >>> pd.Series([np.nan]).sum() - 0.0 - - >>> pd.Series([np.nan]).sum(min_count=1) - nan - """ - result = super().sum( - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - if isinstance(result, Series): - result = result.__finalize__(self, method="sum") - return result + ): + result = super().sum(axis, skipna, numeric_only, min_count, **kwargs) + return result.__finalize__(self, method="sum") - @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="prod") + @doc(make_doc("prod", ndim=2)) def prod( self, axis: Axis | None = 0, @@ -12044,840 +11706,122 @@ def prod( numeric_only: bool = False, min_count: int = 0, **kwargs, - ) -> Series: - """ - Return the product of the values over the requested axis. - - Parameters - ---------- - axis : {index (0), columns (1)} - Axis for the function to be applied on. - For `Series` this parameter is unused and defaults to 0. - - .. warning:: - - The behavior of DataFrame.prod with ``axis=None`` is deprecated, - in a future version this will reduce over both axes and return a scalar - To retain the old behavior, pass axis=0 (or do not pass axis). - - .. versionadded:: 2.0.0 - - skipna : bool, default True - Exclude NA/null values when computing the result. - numeric_only : bool, default False - Include only float, int, boolean columns. Not implemented for Series. - - min_count : int, default 0 - The required number of valid values to perform the operation. If fewer than - ``min_count`` non-NA values are present the result will be NA. - **kwargs - Additional keyword arguments to be passed to the function. - - Returns - ------- - Series or scalar - The product of the values over the requested axis. - - See Also - -------- - Series.sum : Return the sum. - Series.min : Return the minimum. - Series.max : Return the maximum. - Series.idxmin : Return the index of the minimum. - Series.idxmax : Return the index of the maximum. - DataFrame.sum : Return the sum over the requested axis. - DataFrame.min : Return the minimum over the requested axis. - DataFrame.max : Return the maximum over the requested axis. - DataFrame.idxmin : Return the index of the minimum over the requested axis. - DataFrame.idxmax : Return the index of the maximum over the requested axis. - - Examples - -------- - By default, the product of an empty or all-NA Series is ``1`` - - >>> pd.Series([], dtype="float64").prod() - 1.0 - - This can be controlled with the ``min_count`` parameter - - >>> pd.Series([], dtype="float64").prod(min_count=1) - nan - - Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and - empty series identically. 
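
The ``min_count`` examples removed in the ``sum`` hunk above, condensed into a runnable sketch:

import numpy as np
import pandas as pd

# min_count controls the result of empty / all-NA reductions.
pd.Series([], dtype="float64").sum()               # 0.0  (min_count=0 is the default)
pd.Series([], dtype="float64").sum(min_count=1)    # nan
pd.Series([np.nan]).prod()                         # 1.0
pd.Series([np.nan]).prod(min_count=1)              # nan
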
- - >>> pd.Series([np.nan]).prod() - 1.0 - - >>> pd.Series([np.nan]).prod(min_count=1) - nan - """ - result = super().prod( - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - if isinstance(result, Series): - result = result.__finalize__(self, method="prod") - return result - - # error: Signature of "mean" incompatible with supertype "NDFrame" - @overload # type: ignore[override] - def mean( - self, - *, - axis: Axis = ..., - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series: ... - - @overload - def mean( - self, - *, - axis: None, - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Any: ... - - @overload - def mean( - self, - *, - axis: Axis | None, - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series | Any: ... + ): + result = super().prod(axis, skipna, numeric_only, min_count, **kwargs) + return result.__finalize__(self, method="prod") - @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="mean") @doc(make_doc("mean", ndim=2)) - def mean( - self, - axis: Axis | None = 0, - skipna: bool = True, - numeric_only: bool = False, - **kwargs, - ) -> Series | Any: - result = super().mean( - axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs - ) - if isinstance(result, Series): - result = result.__finalize__(self, method="mean") - return result - - # error: Signature of "median" incompatible with supertype "NDFrame" - @overload # type: ignore[override] - def median( - self, - *, - axis: Axis = ..., - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series: ... - - @overload - def median( - self, - *, - axis: None, - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Any: ... - - @overload - def median( - self, - *, - axis: Axis | None, - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series | Any: ... - - @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="median") - @doc(make_doc("median", ndim=2)) - def median( - self, - axis: Axis | None = 0, - skipna: bool = True, - numeric_only: bool = False, - **kwargs, - ) -> Series | Any: - result = super().median( - axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs - ) - if isinstance(result, Series): - result = result.__finalize__(self, method="median") - return result - - # error: Signature of "sem" incompatible with supertype "NDFrame" - @overload # type: ignore[override] - def sem( - self, - *, - axis: Axis = ..., - skipna: bool = ..., - ddof: int = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series: ... - - @overload - def sem( - self, - *, - axis: None, - skipna: bool = ..., - ddof: int = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Any: ... - - @overload - def sem( - self, - *, - axis: Axis | None, - skipna: bool = ..., - ddof: int = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series | Any: ... - - @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sem") - def sem( - self, - axis: Axis | None = 0, - skipna: bool = True, - ddof: int = 1, - numeric_only: bool = False, - **kwargs, - ) -> Series | Any: - """ - Return unbiased standard error of the mean over requested axis. - - Normalized by N-1 by default. This can be changed using the ddof argument - - Parameters - ---------- - axis : {index (0), columns (1)} - For `Series` this parameter is unused and defaults to 0. - - .. 
warning:: - - The behavior of DataFrame.sem with ``axis=None`` is deprecated, - in a future version this will reduce over both axes and return a scalar - To retain the old behavior, pass axis=0 (or do not pass axis). - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations is N - ddof, - where N represents the number of elements. - numeric_only : bool, default False - Include only float, int, boolean columns. Not implemented for Series. - **kwargs : - Additional keywords passed. - - Returns - ------- - Series or DataFrame (if level specified) - Unbiased standard error of the mean over requested axis. - - See Also - -------- - DataFrame.var : Return unbiased variance over requested axis. - DataFrame.std : Returns sample standard deviation over requested axis. - - Examples - -------- - >>> s = pd.Series([1, 2, 3]) - >>> s.sem().round(6) - 0.57735 - - With a DataFrame - - >>> df = pd.DataFrame({"a": [1, 2], "b": [2, 3]}, index=["tiger", "zebra"]) - >>> df - a b - tiger 1 2 - zebra 2 3 - >>> df.sem() - a 0.5 - b 0.5 - dtype: float64 - - Using axis=1 - - >>> df.sem(axis=1) - tiger 0.5 - zebra 0.5 - dtype: float64 - - In this case, `numeric_only` should be set to `True` - to avoid getting an error. - - >>> df = pd.DataFrame({"a": [1, 2], "b": ["T", "Z"]}, index=["tiger", "zebra"]) - >>> df.sem(numeric_only=True) - a 0.5 - dtype: float64 - """ - result = super().sem( - axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only, **kwargs - ) - if isinstance(result, Series): - result = result.__finalize__(self, method="sem") - return result - - # error: Signature of "var" incompatible with supertype "NDFrame" - @overload # type: ignore[override] - def var( - self, - *, - axis: Axis = ..., - skipna: bool = ..., - ddof: int = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series: ... - - @overload - def var( - self, - *, - axis: None, - skipna: bool = ..., - ddof: int = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Any: ... - - @overload - def var( - self, - *, - axis: Axis | None, - skipna: bool = ..., - ddof: int = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series | Any: ... - - @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="var") - def var( - self, - axis: Axis | None = 0, - skipna: bool = True, - ddof: int = 1, - numeric_only: bool = False, - **kwargs, - ) -> Series | Any: - """ - Return unbiased variance over requested axis. - - Normalized by N-1 by default. This can be changed using the ddof argument. - - Parameters - ---------- - axis : {index (0), columns (1)} - For `Series` this parameter is unused and defaults to 0. - - .. warning:: - - The behavior of DataFrame.var with ``axis=None`` is deprecated, - in a future version this will reduce over both axes and return a scalar - To retain the old behavior, pass axis=0 (or do not pass axis). - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations is N - ddof, - where N represents the number of elements. - numeric_only : bool, default False - Include only float, int, boolean columns. Not implemented for Series. - **kwargs : - Additional keywords passed. - - Returns - ------- - Series or scalaer - Unbiased variance over requested axis. - - See Also - -------- - numpy.var : Equivalent function in NumPy. 
- Series.var : Return unbiased variance over Series values. - Series.std : Return standard deviation over Series values. - DataFrame.std : Return standard deviation of the values over - the requested axis. - - Examples - -------- - >>> df = pd.DataFrame( - ... { - ... "person_id": [0, 1, 2, 3], - ... "age": [21, 25, 62, 43], - ... "height": [1.61, 1.87, 1.49, 2.01], - ... } - ... ).set_index("person_id") - >>> df - age height - person_id - 0 21 1.61 - 1 25 1.87 - 2 62 1.49 - 3 43 2.01 - - >>> df.var() - age 352.916667 - height 0.056367 - dtype: float64 - - Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1: - - >>> df.var(ddof=0) - age 264.687500 - height 0.042275 - dtype: float64 - """ - result = super().var( - axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only, **kwargs - ) - if isinstance(result, Series): - result = result.__finalize__(self, method="var") - return result - - # error: Signature of "std" incompatible with supertype "NDFrame" - @overload # type: ignore[override] - def std( - self, - *, - axis: Axis = ..., - skipna: bool = ..., - ddof: int = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series: ... - - @overload - def std( - self, - *, - axis: None, - skipna: bool = ..., - ddof: int = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Any: ... - - @overload - def std( - self, - *, - axis: Axis | None, - skipna: bool = ..., - ddof: int = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series | Any: ... - - @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="std") - def std( - self, - axis: Axis | None = 0, - skipna: bool = True, - ddof: int = 1, - numeric_only: bool = False, - **kwargs, - ) -> Series | Any: - """ - Return sample standard deviation over requested axis. - - Normalized by N-1 by default. This can be changed using the ddof argument. - - Parameters - ---------- - axis : {index (0), columns (1)} - For `Series` this parameter is unused and defaults to 0. - - .. warning:: - - The behavior of DataFrame.std with ``axis=None`` is deprecated, - in a future version this will reduce over both axes and return a scalar - To retain the old behavior, pass axis=0 (or do not pass axis). - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations is N - ddof, - where N represents the number of elements. - numeric_only : bool, default False - Include only float, int, boolean columns. Not implemented for Series. - **kwargs : dict - Additional keyword arguments to be passed to the function. - - Returns - ------- - Series or scalar - Standard deviation over requested axis. - - See Also - -------- - Series.std : Return standard deviation over Series values. - DataFrame.mean : Return the mean of the values over the requested axis. - DataFrame.median : Return the median of the values over the requested axis. - DataFrame.mode : Get the mode(s) of each element along the requested axis. - DataFrame.sum : Return the sum of the values over the requested axis. - - Notes - ----- - To have the same behaviour as `numpy.std`, use `ddof=0` (instead of the - default `ddof=1`) - - Examples - -------- - >>> df = pd.DataFrame( - ... { - ... "person_id": [0, 1, 2, 3], - ... "age": [21, 25, 62, 43], - ... "height": [1.61, 1.87, 1.49, 2.01], - ... } - ... 
).set_index("person_id") - >>> df - age height - person_id - 0 21 1.61 - 1 25 1.87 - 2 62 1.49 - 3 43 2.01 - - The standard deviation of the columns can be found as follows: - - >>> df.std() - age 18.786076 - height 0.237417 - dtype: float64 - - Alternatively, `ddof=0` can be set to normalize by N instead of N-1: - - >>> df.std(ddof=0) - age 16.269219 - height 0.205609 - dtype: float64 - """ - result = super().std( - axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only, **kwargs - ) - if isinstance(result, Series): - result = result.__finalize__(self, method="std") - return result - - # error: Signature of "skew" incompatible with supertype "NDFrame" - @overload # type: ignore[override] - def skew( - self, - *, - axis: Axis = ..., - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series: ... - - @overload - def skew( - self, - *, - axis: None, - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Any: ... - - @overload - def skew( - self, - *, - axis: Axis | None, - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series | Any: ... - - @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="skew") - def skew( - self, - axis: Axis | None = 0, - skipna: bool = True, - numeric_only: bool = False, - **kwargs, - ) -> Series | Any: - """ - Return unbiased skew over requested axis. - - Normalized by N-1. - - Parameters - ---------- - axis : {index (0), columns (1)} - Axis for the function to be applied on. - For `Series` this parameter is unused and defaults to 0. - - For DataFrames, specifying ``axis=None`` will apply the aggregation - across both axes. - - .. versionadded:: 2.0.0 - - skipna : bool, default True - Exclude NA/null values when computing the result. - numeric_only : bool, default False - Include only float, int, boolean columns. - - **kwargs - Additional keyword arguments to be passed to the function. - - Returns - ------- - Series or scalar - Unbiased skew over requested axis. - - See Also - -------- - Dataframe.kurt : Returns unbiased kurtosis over requested axis. - - Examples - -------- - >>> s = pd.Series([1, 2, 3]) - >>> s.skew() - 0.0 - - With a DataFrame - - >>> df = pd.DataFrame( - ... {"a": [1, 2, 3], "b": [2, 3, 4], "c": [1, 3, 5]}, - ... index=["tiger", "zebra", "cow"], - ... ) - >>> df - a b c - tiger 1 2 1 - zebra 2 3 3 - cow 3 4 5 - >>> df.skew() - a 0.0 - b 0.0 - c 0.0 - dtype: float64 - - Using axis=1 - - >>> df.skew(axis=1) - tiger 1.732051 - zebra -1.732051 - cow 0.000000 - dtype: float64 - - In this case, `numeric_only` should be set to `True` to avoid - getting an error. - - >>> df = pd.DataFrame( - ... {"a": [1, 2, 3], "b": ["T", "Z", "X"]}, index=["tiger", "zebra", "cow"] - ... ) - >>> df.skew(numeric_only=True) - a 0.0 - dtype: float64 - """ - result = super().skew( - axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs - ) - if isinstance(result, Series): - result = result.__finalize__(self, method="skew") - return result - - # error: Signature of "kurt" incompatible with supertype "NDFrame" - @overload # type: ignore[override] - def kurt( - self, - *, - axis: Axis = ..., - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series: ... - - @overload - def kurt( + def mean( self, - *, - axis: None, - skipna: bool = ..., - numeric_only: bool = ..., + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, **kwargs, - ) -> Any: ... 
+ ): + result = super().mean(axis, skipna, numeric_only, **kwargs) + if isinstance(result, Series): + result = result.__finalize__(self, method="mean") + return result - @overload - def kurt( + @doc(make_doc("median", ndim=2)) + def median( self, - *, - axis: Axis | None, - skipna: bool = ..., - numeric_only: bool = ..., + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, **kwargs, - ) -> Series | Any: ... + ): + result = super().median(axis, skipna, numeric_only, **kwargs) + if isinstance(result, Series): + result = result.__finalize__(self, method="median") + return result - @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="kurt") - def kurt( + @doc(make_doc("sem", ndim=2)) + def sem( self, axis: Axis | None = 0, skipna: bool = True, + ddof: int = 1, numeric_only: bool = False, **kwargs, - ) -> Series | Any: - """ - Return unbiased kurtosis over requested axis. - - Kurtosis obtained using Fisher's definition of - kurtosis (kurtosis of normal == 0.0). Normalized by N-1. - - Parameters - ---------- - axis : {index (0), columns (1)} - Axis for the function to be applied on. - For `Series` this parameter is unused and defaults to 0. - - For DataFrames, specifying ``axis=None`` will apply the aggregation - across both axes. - - .. versionadded:: 2.0.0 - - skipna : bool, default True - Exclude NA/null values when computing the result. - numeric_only : bool, default False - Include only float, int, boolean columns. - - **kwargs - Additional keyword arguments to be passed to the function. - - Returns - ------- - Series or scalar - Unbiased kurtosis over requested axis. - - See Also - -------- - Dataframe.kurtosis : Returns unbiased kurtosis over requested axis. - - Examples - -------- - >>> s = pd.Series([1, 2, 2, 3], index=["cat", "dog", "dog", "mouse"]) - >>> s - cat 1 - dog 2 - dog 2 - mouse 3 - dtype: int64 - >>> s.kurt() - 1.5 - - With a DataFrame - - >>> df = pd.DataFrame( - ... {"a": [1, 2, 2, 3], "b": [3, 4, 4, 4]}, - ... index=["cat", "dog", "dog", "mouse"], - ... ) - >>> df - a b - cat 1 3 - dog 2 4 - dog 2 4 - mouse 3 4 - >>> df.kurt() - a 1.5 - b 4.0 - dtype: float64 - - With axis=None - - >>> df.kurt(axis=None).round(6) - -0.988693 - - Using axis=1 - - >>> df = pd.DataFrame( - ... {"a": [1, 2], "b": [3, 4], "c": [3, 4], "d": [1, 2]}, - ... index=["cat", "dog"], - ... 
) - >>> df.kurt(axis=1) - cat -6.0 - dog -6.0 - dtype: float64 - """ - result = super().kurt( - axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs - ) + ): + result = super().sem(axis, skipna, ddof, numeric_only, **kwargs) if isinstance(result, Series): - result = result.__finalize__(self, method="kurt") + result = result.__finalize__(self, method="sem") return result - # error: Incompatible types in assignment - kurtosis = kurt # type: ignore[assignment] - product = prod - - @doc(make_doc("cummin", ndim=2)) - def cummin( + @doc(make_doc("var", ndim=2)) + def var( self, - axis: Axis = 0, + axis: Axis | None = 0, skipna: bool = True, + ddof: int = 1, numeric_only: bool = False, - *args, **kwargs, - ) -> Self: - data = self._get_numeric_data() if numeric_only else self - return NDFrame.cummin(data, axis, skipna, *args, **kwargs) + ): + result = super().var(axis, skipna, ddof, numeric_only, **kwargs) + if isinstance(result, Series): + result = result.__finalize__(self, method="var") + return result - @doc(make_doc("cummax", ndim=2)) - def cummax( + @doc(make_doc("std", ndim=2)) + def std( self, - axis: Axis = 0, + axis: Axis | None = 0, skipna: bool = True, + ddof: int = 1, numeric_only: bool = False, - *args, **kwargs, - ) -> Self: - data = self._get_numeric_data() if numeric_only else self - return NDFrame.cummax(data, axis, skipna, *args, **kwargs) + ): + result = super().std(axis, skipna, ddof, numeric_only, **kwargs) + if isinstance(result, Series): + result = result.__finalize__(self, method="std") + return result - @doc(make_doc("cumsum", ndim=2)) - def cumsum( + @doc(make_doc("skew", ndim=2)) + def skew( self, - axis: Axis = 0, + axis: Axis | None = 0, skipna: bool = True, numeric_only: bool = False, - *args, **kwargs, - ) -> Self: - data = self._get_numeric_data() if numeric_only else self - return NDFrame.cumsum(data, axis, skipna, *args, **kwargs) + ): + result = super().skew(axis, skipna, numeric_only, **kwargs) + if isinstance(result, Series): + result = result.__finalize__(self, method="skew") + return result - @doc(make_doc("cumprod", 2)) - def cumprod( + @doc(make_doc("kurt", ndim=2)) + def kurt( self, - axis: Axis = 0, + axis: Axis | None = 0, skipna: bool = True, numeric_only: bool = False, - *args, **kwargs, - ) -> Self: - data = self._get_numeric_data() if numeric_only else self - return NDFrame.cumprod(data, axis, skipna, *args, **kwargs) + ): + result = super().kurt(axis, skipna, numeric_only, **kwargs) + if isinstance(result, Series): + result = result.__finalize__(self, method="kurt") + return result + + kurtosis = kurt + product = prod + + @doc(make_doc("cummin", ndim=2)) + def cummin(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): + return NDFrame.cummin(self, axis, skipna, *args, **kwargs) + + @doc(make_doc("cummax", ndim=2)) + def cummax(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): + return NDFrame.cummax(self, axis, skipna, *args, **kwargs) + + @doc(make_doc("cumsum", ndim=2)) + def cumsum(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): + return NDFrame.cumsum(self, axis, skipna, *args, **kwargs) + + @doc(make_doc("cumprod", 2)) + def cumprod(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): + return NDFrame.cumprod(self, axis, skipna, *args, **kwargs) def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: """ @@ -12897,7 +11841,6 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: Returns ------- Series - Series with counts of unique 
values per row or column, depending on `axis`. See Also -------- @@ -12906,7 +11849,7 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: Examples -------- - >>> df = pd.DataFrame({"A": [4, 5, 6], "B": [4, 1, 1]}) + >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]}) >>> df.nunique() A 3 B 2 @@ -12920,80 +11863,10 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: """ return self.apply(Series.nunique, axis=axis, dropna=dropna) + @doc(_shared_docs["idxmin"], numeric_only_default="False") def idxmin( self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False ) -> Series: - """ - Return index of first occurrence of minimum over requested axis. - - NA/null values are excluded. - - Parameters - ---------- - axis : {{0 or 'index', 1 or 'columns'}}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - skipna : bool, default True - Exclude NA/null values. If the entire DataFrame is NA, - or if ``skipna=False`` and there is an NA value, this method - will raise a ``ValueError``. - numeric_only : bool, default False - Include only `float`, `int` or `boolean` data. - - .. versionadded:: 1.5.0 - - Returns - ------- - Series - Indexes of minima along the specified axis. - - Raises - ------ - ValueError - * If the row/column is empty - - See Also - -------- - Series.idxmin : Return index of the minimum element. - - Notes - ----- - This method is the DataFrame version of ``ndarray.argmin``. - - Examples - -------- - Consider a dataset containing food consumption in Argentina. - - >>> df = pd.DataFrame( - ... { - ... { - ... "consumption": [10.51, 103.11, 55.48], - ... "co2_emissions": [37.2, 19.66, 1712], - ... } - ... }, - ... index=["Pork", "Wheat Products", "Beef"], - ... ) - - >>> df - consumption co2_emissions - Pork 10.51 37.20 - Wheat Products 103.11 19.66 - Beef 55.48 1712.00 - - By default, it returns the index for the minimum value in each column. - - >>> df.idxmin() - consumption Pork - co2_emissions Wheat Products - dtype: object - - To return the index for the minimum value in each row, use ``axis="columns"``. - - >>> df.idxmin(axis="columns") - Pork consumption - Wheat Products co2_emissions - Beef consumption - dtype: object - """ axis = self._get_axis_number(axis) if self.empty and len(self.axes[axis]): @@ -13027,80 +11900,10 @@ def idxmin( final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) return final_result.__finalize__(self, method="idxmin") + @doc(_shared_docs["idxmax"], numeric_only_default="False") def idxmax( self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False ) -> Series: - """ - Return index of first occurrence of maximum over requested axis. - - NA/null values are excluded. - - Parameters - ---------- - axis : {{0 or 'index', 1 or 'columns'}}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - skipna : bool, default True - Exclude NA/null values. If the entire DataFrame is NA, - or if ``skipna=False`` and there is an NA value, this method - will raise a ``ValueError``. - numeric_only : bool, default False - Include only `float`, `int` or `boolean` data. - - .. versionadded:: 1.5.0 - - Returns - ------- - Series - Indexes of maxima along the specified axis. - - Raises - ------ - ValueError - * If the row/column is empty - - See Also - -------- - Series.idxmax : Return index of the maximum element. - - Notes - ----- - This method is the DataFrame version of ``ndarray.argmax``. 
- - Examples - -------- - Consider a dataset containing food consumption in Argentina. - - >>> df = pd.DataFrame( - ... { - ... { - ... "consumption": [10.51, 103.11, 55.48], - ... "co2_emissions": [37.2, 19.66, 1712], - ... } - ... }, - ... index=["Pork", "Wheat Products", "Beef"], - ... ) - - >>> df - consumption co2_emissions - Pork 10.51 37.20 - Wheat Products 103.11 19.66 - Beef 55.48 1712.00 - - By default, it returns the index for the maximum value in each column. - - >>> df.idxmax() - consumption Wheat Products - co2_emissions Beef - dtype: object - - To return the index for the maximum value in each row, use ``axis="columns"``. - - >>> df.idxmax(axis="columns") - Pork co2_emissions - Wheat Products consumption - Beef co2_emissions - dtype: object - """ axis = self._get_axis_number(axis) if self.empty and len(self.axes[axis]): @@ -13143,7 +11946,7 @@ def _get_agg_axis(self, axis_num: int) -> Index: elif axis_num == 1: return self.index else: - raise ValueError(f"Axis must be 0 or 1 (got {axis_num!r})") + raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") def mode( self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True @@ -13179,16 +11982,12 @@ def mode( Examples -------- - >>> df = pd.DataFrame( - ... [ - ... ("bird", 2, 2), - ... ("mammal", 4, np.nan), - ... ("arthropod", 8, 0), - ... ("bird", 2, np.nan), - ... ], - ... index=("falcon", "horse", "spider", "ostrich"), - ... columns=("species", "legs", "wings"), - ... ) + >>> df = pd.DataFrame([('bird', 2, 2), + ... ('mammal', 4, np.nan), + ... ('arthropod', 8, 0), + ... ('bird', 2, np.nan)], + ... index=('falcon', 'horse', 'spider', 'ostrich'), + ... columns=('species', 'legs', 'wings')) >>> df species legs wings falcon bird 2 2.0 @@ -13222,7 +12021,7 @@ def mode( To compute the mode over columns and not rows, use the axis parameter: - >>> df.mode(axis="columns", numeric_only=True) + >>> df.mode(axis='columns', numeric_only=True) 0 1 falcon 2.0 NaN horse 4.0 NaN @@ -13249,7 +12048,8 @@ def quantile( numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., method: Literal["single", "table"] = ..., - ) -> Series: ... + ) -> Series: + ... @overload def quantile( @@ -13259,7 +12059,8 @@ def quantile( numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., method: Literal["single", "table"] = ..., - ) -> Series | DataFrame: ... + ) -> Series | DataFrame: + ... @overload def quantile( @@ -13269,7 +12070,8 @@ def quantile( numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., method: Literal["single", "table"] = ..., - ) -> Series | DataFrame: ... + ) -> Series | DataFrame: + ... def quantile( self, @@ -13326,25 +12128,24 @@ def quantile( Examples -------- - >>> df = pd.DataFrame( - ... np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), columns=["a", "b"] - ... ) - >>> df.quantile(0.1) + >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), + ... columns=['a', 'b']) + >>> df.quantile(.1) a 1.3 b 3.7 Name: 0.1, dtype: float64 - >>> df.quantile([0.1, 0.5]) + >>> df.quantile([.1, .5]) a b 0.1 1.3 3.7 0.5 2.5 55.0 Specifying `method='table'` will compute the quantile over all columns. 
- >>> df.quantile(0.1, method="table", interpolation="nearest") + >>> df.quantile(.1, method="table", interpolation="nearest") a 1 b 1 Name: 0.1, dtype: int64 - >>> df.quantile([0.1, 0.5], method="table", interpolation="nearest") + >>> df.quantile([.1, .5], method="table", interpolation="nearest") a b 0.1 1 1 0.5 3 100 @@ -13352,13 +12153,11 @@ def quantile( Specifying `numeric_only=False` will also compute the quantile of datetime and timedelta data. - >>> df = pd.DataFrame( - ... { - ... "A": [1, 2], - ... "B": [pd.Timestamp("2010"), pd.Timestamp("2011")], - ... "C": [pd.Timedelta("1 days"), pd.Timedelta("2 days")], - ... } - ... ) + >>> df = pd.DataFrame({'A': [1, 2], + ... 'B': [pd.Timestamp('2010'), + ... pd.Timestamp('2011')], + ... 'C': [pd.Timedelta('1 days'), + ... pd.Timedelta('2 days')]}) >>> df.quantile(0.5, numeric_only=False) A 1.5 B 2010-07-02 12:00:00 @@ -13399,7 +12198,7 @@ def quantile( if len(data.columns) == 0: # GH#23925 _get_numeric_data may have dropped all columns - cols = self.columns[:0] + cols = Index([], name=self.columns.name) dtype = np.float64 if axis == 1: @@ -13454,12 +12253,10 @@ def to_timestamp( freq: Frequency | None = None, how: ToTimestampHow = "start", axis: Axis = 0, - copy: bool | lib.NoDefault = lib.no_default, + copy: bool | None = None, ) -> DataFrame: """ - Cast PeriodIndex to DatetimeIndex of timestamps, at *beginning* of period. - - This can be changed to the *end* of the period, by specifying `how="e"`. + Cast to DatetimeIndex of timestamps, at *beginning* of period. Parameters ---------- @@ -13470,7 +12267,7 @@ def to_timestamp( vs. end. axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to convert (the index by default). - copy : bool, default False + copy : bool, default True If False then underlying input data is not copied. .. note:: @@ -13485,22 +12282,15 @@ def to_timestamp( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` - .. deprecated:: 3.0.0 - Returns ------- - DataFrame with DatetimeIndex - DataFrame with the PeriodIndex cast to DatetimeIndex. - - See Also - -------- - DataFrame.to_period: Inverse method to cast DatetimeIndex to PeriodIndex. - Series.to_timestamp: Equivalent method for Series. + DataFrame + The DataFrame has a DatetimeIndex. Examples -------- - >>> idx = pd.PeriodIndex(["2023", "2024"], freq="Y") - >>> d = {"col1": [1, 2], "col2": [3, 4]} + >>> idx = pd.PeriodIndex(['2023', '2024'], freq='Y') + >>> d = {'col1': [1, 2], 'col2': [3, 4]} >>> df1 = pd.DataFrame(data=d, index=idx) >>> df1 col1 col2 @@ -13520,7 +12310,7 @@ def to_timestamp( Using `freq` which is the offset that the Timestamps will have >>> df2 = pd.DataFrame(data=d, index=idx) - >>> df2 = df2.to_timestamp(freq="M") + >>> df2 = df2.to_timestamp(freq='M') >>> df2 col1 col2 2023-01-31 1 3 @@ -13528,8 +12318,7 @@ def to_timestamp( >>> df2.index DatetimeIndex(['2023-01-31', '2024-01-31'], dtype='datetime64[ns]', freq=None) """ - self._check_copy_deprecation(copy) - new_obj = self.copy(deep=False) + new_obj = self.copy(deep=copy and not using_copy_on_write()) axis_name = self._get_axis_name(axis) old_ax = getattr(self, axis_name) @@ -13542,17 +12331,13 @@ def to_timestamp( return new_obj def to_period( - self, - freq: Frequency | None = None, - axis: Axis = 0, - copy: bool | lib.NoDefault = lib.no_default, + self, freq: Frequency | None = None, axis: Axis = 0, copy: bool | None = None ) -> DataFrame: """ Convert DataFrame from DatetimeIndex to PeriodIndex. 
Convert DataFrame from DatetimeIndex to PeriodIndex with desired - frequency (inferred from index if not passed). Either index of columns can be - converted, depending on `axis` argument. + frequency (inferred from index if not passed). Parameters ---------- @@ -13560,7 +12345,7 @@ def to_period( Frequency of the PeriodIndex. axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to convert (the index by default). - copy : bool, default False + copy : bool, default True If False then underlying input data is not copied. .. note:: @@ -13575,17 +12360,10 @@ def to_period( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` - .. deprecated:: 3.0.0 - Returns ------- DataFrame - The DataFrame with the converted PeriodIndex. - - See Also - -------- - Series.to_period: Equivalent method for Series. - Series.dt.to_period: Convert DateTime column values. + The DataFrame has a PeriodIndex. Examples -------- @@ -13599,7 +12377,7 @@ def to_period( >>> idx DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'], - dtype='datetime64[s]', freq=None) + dtype='datetime64[ns]', freq=None) >>> idx.to_period("M") PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]') @@ -13609,8 +12387,7 @@ def to_period( >>> idx.to_period("Y") PeriodIndex(['2001', '2002', '2003'], dtype='period[Y-DEC]') """ - self._check_copy_deprecation(copy) - new_obj = self.copy(deep=False) + new_obj = self.copy(deep=copy and not using_copy_on_write()) axis_name = self._get_axis_name(axis) old_ax = getattr(self, axis_name) @@ -13648,16 +12425,10 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: Series.str.contains: Test if pattern or regex is contained within a string of a Series or Index. - Notes - ----- - ``__iter__`` is used (and not ``__contains__``) to iterate over values - when checking if it contains the elements in DataFrame. - Examples -------- - >>> df = pd.DataFrame( - ... {"num_legs": [2, 4], "num_wings": [2, 0]}, index=["falcon", "dog"] - ... ) + >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, + ... index=['falcon', 'dog']) >>> df num_legs num_wings falcon 2 2 @@ -13681,7 +12452,7 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: When ``values`` is a dict, we can pass values to check for each column separately: - >>> df.isin({"num_wings": [0, 3]}) + >>> df.isin({'num_wings': [0, 3]}) num_legs num_wings falcon False False dog False True @@ -13690,9 +12461,8 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: match. Note that 'falcon' does not match based on the number of legs in other. - >>> other = pd.DataFrame( - ... {"num_legs": [8, 3], "num_wings": [0, 2]}, index=["spider", "falcon"] - ... ) + >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]}, + ... index=['spider', 'falcon']) >>> df.isin(other) num_legs num_wings falcon False True @@ -13803,41 +12573,29 @@ def isin_(x): ) columns = properties.AxisProperty( axis=0, - doc=""" - The column labels of the DataFrame. - - This property holds the column names as a pandas ``Index`` object. - It provides an immutable sequence of column labels that can be - used for data selection, renaming, and alignment in DataFrame operations. - - Returns - ------- - pandas.Index - The column labels of the DataFrame. - - See Also - -------- - DataFrame.index: The index (row labels) of the DataFrame. - DataFrame.axes: Return a list representing the axes of the DataFrame. 
- - Examples - -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) - >>> df - A B - 0 1 3 - 1 2 4 - >>> df.columns - Index(['A', 'B'], dtype='object') - """, + doc=dedent( + """ + The column labels of the DataFrame. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df + A B + 0 1 3 + 1 2 4 + >>> df.columns + Index(['A', 'B'], dtype='object') + """ + ), ) # ---------------------------------------------------------------------- # Add plotting methods to DataFrame - plot = Accessor("plot", pandas.plotting.PlotAccessor) + plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) hist = pandas.plotting.hist_frame boxplot = pandas.plotting.boxplot_frame - sparse = Accessor("sparse", SparseFrameAccessor) + sparse = CachedAccessor("sparse", SparseFrameAccessor) # ---------------------------------------------------------------------- # Internal Interface Methods @@ -13847,12 +12605,14 @@ def _to_dict_of_blocks(self): Return a dict of dtype -> Constructor Types that each is a homogeneous dtype. - Internal ONLY. + Internal ONLY - only works for BlockManager """ mgr = self._mgr + # convert to BlockManager if needed -> this way support ArrayManager as well + mgr = cast(BlockManager, mgr_to_mgr(mgr, "block")) return { k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self) - for k, v in mgr.to_iter_dict() + for k, v, in mgr.to_dict().items() } @property @@ -13895,9 +12655,9 @@ def values(self) -> np.ndarray: A DataFrame where all columns are the same type (e.g., int64) results in an array of the same type. - >>> df = pd.DataFrame( - ... {"age": [3, 29], "height": [94, 170], "weight": [31, 115]} - ... ) + >>> df = pd.DataFrame({'age': [ 3, 29], + ... 'height': [94, 170], + ... 'weight': [31, 115]}) >>> df age height weight 0 3 94 31 @@ -13915,14 +12675,10 @@ def values(self) -> np.ndarray: results in an ndarray of the broadest type that accommodates these mixed types (e.g., object). - >>> df2 = pd.DataFrame( - ... [ - ... ("parrot", 24.0, "second"), - ... ("lion", 80.5, 1), - ... ("monkey", np.nan, None), - ... ], - ... columns=("name", "max_speed", "rank"), - ... ) + >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'), + ... ('lion', 80.5, 1), + ... ('monkey', np.nan, None)], + ... 
columns=('name', 'max_speed', 'rank')) >>> df2.dtypes name object max_speed float64 @@ -13936,12 +12692,8 @@ def values(self) -> np.ndarray: return self._mgr.as_array() -def _from_nested_dict( - data: Mapping[HashableT, Mapping[HashableT2, T]], -) -> collections.defaultdict[HashableT2, dict[HashableT, T]]: - new_data: collections.defaultdict[HashableT2, dict[HashableT, T]] = ( - collections.defaultdict(dict) - ) +def _from_nested_dict(data) -> collections.defaultdict: + new_data: collections.defaultdict = collections.defaultdict(dict) for index, s in data.items(): for col, v in s.items(): new_data[col][index] = v @@ -13954,7 +12706,7 @@ def _reindex_for_setitem( # reindex if necessary if value.index.equals(index) or not len(index): - if isinstance(value, Series): + if using_copy_on_write() and isinstance(value, Series): return value._values, value._references return value._values.copy(), None @@ -13971,3 +12723,4 @@ def _reindex_for_setitem( "incompatible index of inserted column with frame index" ) from err return reindexed_value, None +# confirming git trackSS \ No newline at end of file diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a7a287de0241e..8aae4609b1833 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -612,7 +612,6 @@ def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]: v, copy=False, index=self.index, name=k, dtype=dtype ).__finalize__(self) for k, v, dtype in zip(self.columns, self._iter_column_arrays(), dtypes) - if not isinstance(k, int) } @final @@ -887,7 +886,7 @@ def squeeze(self, axis: Axis | None = None) -> Scalar | Series | DataFrame: dtype: int64 >>> even_primes.squeeze() - 2 + np.int64(2) Squeezing objects with more than one value in every axis does nothing: @@ -945,7 +944,7 @@ def squeeze(self, axis: Axis | None = None) -> Scalar | Series | DataFrame: Squeezing all axes will project directly into a scalar: >>> df_0a.squeeze() - 1 + np.int64(1) """ axes = range(self._AXIS_LEN) if axis is None else (self._get_axis_number(axis),) result = self.iloc[ @@ -1646,11 +1645,7 @@ def _is_label_reference(self, key: Level, axis: Axis = 0) -> bool: axis_int = self._get_axis_number(axis) other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis_int) - return ( - key is not None - and is_hashable(key) - and any(key in self.axes[ax] for ax in other_axes) - ) + return is_hashable(key) and any(key in self.axes[ax] for ax in other_axes) @final def _is_label_or_level_reference(self, key: Level, axis: AxisInt = 0) -> bool: @@ -3569,6 +3564,7 @@ def _wrap(x, alt_format_): elif formatters is None and float_format is not None: formatters_ = partial(_wrap, alt_format_=lambda v: v) format_index_ = [index_format_, column_format_] + format_index_names_ = [index_format_, column_format_] # Deal with hiding indexes and relabelling column names hide_: list[dict] = [] @@ -3617,6 +3613,7 @@ def _wrap(x, alt_format_): relabel_index=relabel_index_, format={"formatter": formatters_, **base_format_}, format_index=format_index_, + format_index_names=format_index_names_, render_kwargs=render_kwargs_, ) @@ -3629,6 +3626,7 @@ def _to_latex_via_styler( relabel_index: dict | list[dict] | None = None, format: dict | list[dict] | None = None, format_index: dict | list[dict] | None = None, + format_index_names: dict | list[dict] | None = None, render_kwargs: dict | None = None, ): """ @@ -3673,7 +3671,13 @@ def _to_latex_via_styler( self = cast("DataFrame", self) styler = Styler(self, uuid="") - for kw_name in ["hide", "relabel_index", "format", 
"format_index"]: + for kw_name in [ + "hide", + "relabel_index", + "format", + "format_index", + "format_index_names", + ]: kw = vars()[kw_name] if isinstance(kw, dict): getattr(styler, kw_name)(**kw) @@ -3956,7 +3960,7 @@ def take(self, indices, axis: Axis = 0, **kwargs) -> Self: ---------- indices : array-like An array of ints indicating which positions to take. - axis : {0 or 'index', 1 or 'columns', None}, default 0 + axis : {0 or 'index', 1 or 'columns'}, default 0 The axis on which to select elements. ``0`` means that we are selecting rows, ``1`` means that we are selecting columns. For `Series` this parameter is unused and defaults to 0. @@ -6077,8 +6081,10 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: # One could make the deepcopy unconditionally, but a deepcopy # of an empty dict is 50x more expensive than the empty check. self.attrs = deepcopy(other.attrs) - - self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels + self.flags.allows_duplicate_labels = ( + self.flags.allows_duplicate_labels + and other.flags.allows_duplicate_labels + ) # For subclasses using _metadata. for name in set(self._metadata) & set(other._metadata): assert isinstance(name, str) @@ -6809,12 +6815,12 @@ def convert_dtypes( 2 3 z 20 200.0 >>> dfn.dtypes - a Int32 - b string[python] - c boolean - d string[python] - e Int64 - f Float64 + a Int32 + b string + c boolean + d string + e Int64 + f Float64 dtype: object Start with a Series of strings and missing data represented by ``np.nan``. @@ -7954,7 +7960,7 @@ def asof(self, where, subset=None): dtype: float64 >>> s.asof(20) - 2.0 + np.float64(2.0) For a sequence `where`, a Series is returned. The first value is NaN, because the first element of `where` is before the first @@ -7969,7 +7975,7 @@ def asof(self, where, subset=None): NaN, even though NaN is at the index location for ``30``. >>> s.asof(30) - 2.0 + np.float64(2.0) Take all columns into consideration @@ -9704,7 +9710,7 @@ def _where( # CoW: Make sure reference is not kept alive if cond.ndim == 1 and self.ndim == 2: cond = cond._constructor_expanddim( - {i: cond for i in range(len(self.columns))}, + dict.fromkeys(range(len(self.columns)), cond), copy=False, ) cond.columns = self.columns diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1251403db6ff3..49b80337c700e 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -504,11 +504,13 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) # inference. We default to using the existing dtype. # xref GH#51445 obj = self._obj_with_exclusions - return self.obj._constructor( - [], - name=self.obj.name, - index=self._grouper.result_index, - dtype=obj.dtype, + return self._wrap_aggregated_output( + self.obj._constructor( + [], + name=self.obj.name, + index=self._grouper.result_index, + dtype=obj.dtype, + ) ) return self._python_agg_general(func, *args, **kwargs) @@ -2505,7 +2507,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame: ) results = [func(sgb) for sgb in sgbs] - if not len(results): + if not results: # concat would raise res_df = DataFrame([], columns=columns, index=self._grouper.result_index) else: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d0c0ed29b6d44..3daee98371844 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -81,6 +81,7 @@ class providing the base-class of operations. 
is_numeric_dtype, is_object_dtype, is_scalar, + is_string_dtype, needs_i8_conversion, pandas_dtype, ) @@ -141,6 +142,7 @@ class providing the base-class of operations. if TYPE_CHECKING: from pandas._libs.tslibs import BaseOffset + from pandas._libs.tslibs.timedeltas import Timedelta from pandas._typing import ( Any, Concatenate, @@ -546,7 +548,8 @@ def groups(self) -> dict[Hashable, Index]: 2023-02-15 4 dtype: int64 >>> ser.resample("MS").groups - {Timestamp('2023-01-01 00:00:00'): 2, Timestamp('2023-02-01 00:00:00'): 4} + {Timestamp('2023-01-01 00:00:00'): np.int64(2), + Timestamp('2023-02-01 00:00:00'): np.int64(4)} """ if isinstance(self.keys, list) and len(self.keys) == 1: warnings.warn( @@ -613,7 +616,7 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: toucan 1 5 6 eagle 7 8 9 >>> df.groupby(by=["a"]).indices - {1: array([0, 1]), 7: array([2])} + {np.int64(1): array([0, 1]), np.int64(7): array([2])} For Resampler: @@ -1723,8 +1726,13 @@ def _agg_py_fallback( # preserve the kind of exception that raised raise type(err)(msg) from err - if ser.dtype == object: + dtype = ser.dtype + if dtype == object: res_values = res_values.astype(object, copy=False) + elif is_string_dtype(dtype): + # mypy doesn't infer dtype is an ExtensionDtype + string_array_cls = dtype.construct_array_type() # type: ignore[union-attr] + res_values = string_array_cls._from_sequence(res_values, dtype=dtype) # If we are DataFrameGroupBy and went through a SeriesGroupByPath # then we need to reshape @@ -1877,7 +1885,7 @@ def _apply_filter(self, indices, dropna): mask.fill(False) mask[indices.astype(int)] = True # mask fails to broadcast when passed to where; broadcast manually. - mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T + mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T # type: ignore[assignment] filtered = self._selected_obj.where(mask) # Fill with NaNs. return filtered @@ -3802,44 +3810,179 @@ def rolling( ) @final - @Substitution(name="groupby") - @Appender(_common_see_also) - def expanding(self, *args, **kwargs) -> ExpandingGroupby: + def expanding( + self, + min_periods: int = 1, + method: str = "single", + ) -> ExpandingGroupby: """ - Return an expanding grouper, providing expanding - functionality per group. + Return an expanding grouper, providing expanding functionality per group. + + Parameters + ---------- + min_periods : int, default 1 + Minimum number of observations in window required to have a value; + otherwise, result is ``np.nan``. + + method : str {'single', 'table'}, default 'single' + Execute the expanding operation per single column or row (``'single'``) + or over the entire object (``'table'``). + + This argument is only implemented when specifying ``engine='numba'`` + in the method call. Returns ------- pandas.api.typing.ExpandingGroupby + An object that supports expanding transformations over each group. + + See Also + -------- + Series.expanding : Expanding transformations for Series. + DataFrame.expanding : Expanding transformations for DataFrames. + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "Class": ["A", "A", "A", "B", "B", "B"], + ... "Value": [10, 20, 30, 40, 50, 60], + ... } + ... 
) + >>> df + Class Value + 0 A 10 + 1 A 20 + 2 A 30 + 3 B 40 + 4 B 50 + 5 B 60 + + >>> df.groupby("Class").expanding().mean() + Value + Class + A 0 10.0 + 1 15.0 + 2 20.0 + B 3 40.0 + 4 45.0 + 5 50.0 """ from pandas.core.window import ExpandingGroupby return ExpandingGroupby( self._selected_obj, - *args, + min_periods=min_periods, + method=method, _grouper=self._grouper, - **kwargs, ) @final - @Substitution(name="groupby") - @Appender(_common_see_also) - def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby: + def ewm( + self, + com: float | None = None, + span: float | None = None, + halflife: float | str | Timedelta | None = None, + alpha: float | None = None, + min_periods: int | None = 0, + adjust: bool = True, + ignore_na: bool = False, + times: np.ndarray | Series | None = None, + method: str = "single", + ) -> ExponentialMovingWindowGroupby: """ Return an ewm grouper, providing ewm functionality per group. + Parameters + ---------- + com : float, optional + Specify decay in terms of center of mass. + Alternative to ``span``, ``halflife``, and ``alpha``. + + span : float, optional + Specify decay in terms of span. + + halflife : float, str, or Timedelta, optional + Specify decay in terms of half-life. + + alpha : float, optional + Specify smoothing factor directly. + + min_periods : int, default 0 + Minimum number of observations in the window required to have a value; + otherwise, result is ``np.nan``. + + adjust : bool, default True + Divide by decaying adjustment factor to account for imbalance in + relative weights. + + ignore_na : bool, default False + Ignore missing values when calculating weights. + + times : str or array-like of datetime64, optional + Times corresponding to the observations. + + method : {'single', 'table'}, default 'single' + Execute the operation per group independently (``'single'``) or over the + entire object before regrouping (``'table'``). Only applicable to + ``mean()``, and only when using ``engine='numba'``. + Returns ------- pandas.api.typing.ExponentialMovingWindowGroupby + An object that supports exponentially weighted moving transformations over + each group. + + See Also + -------- + Series.ewm : EWM transformations for Series. + DataFrame.ewm : EWM transformations for DataFrames. + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "Class": ["A", "A", "A", "B", "B", "B"], + ... "Value": [10, 20, 30, 40, 50, 60], + ... } + ... 
) + >>> df + Class Value + 0 A 10 + 1 A 20 + 2 A 30 + 3 B 40 + 4 B 50 + 5 B 60 + + >>> df.groupby("Class").ewm(com=0.5).mean() + Value + Class + A 0 10.000000 + 1 17.500000 + 2 26.153846 + B 3 40.000000 + 4 47.500000 + 5 56.153846 """ from pandas.core.window import ExponentialMovingWindowGroupby return ExponentialMovingWindowGroupby( self._selected_obj, - *args, + com=com, + span=span, + halflife=halflife, + alpha=alpha, + min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na, + times=times, + method=method, _grouper=self._grouper, - **kwargs, ) @final @@ -4440,11 +4583,11 @@ def blk_func(values: ArrayLike) -> ArrayLike: ) if vals.ndim == 1: - out = out.ravel("K") + out = out.ravel("K") # type: ignore[assignment] if result_mask is not None: - result_mask = result_mask.ravel("K") + result_mask = result_mask.ravel("K") # type: ignore[assignment] else: - out = out.reshape(ncols, ngroups * nqs) + out = out.reshape(ncols, ngroups * nqs) # type: ignore[assignment] return post_processor(out, inference, result_mask, orig_vals) @@ -5174,8 +5317,8 @@ def diff( shifted = shifted.astype("float32") else: to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32] - if len(to_coerce): - shifted = shifted.astype({c: "float32" for c in to_coerce}) + if to_coerce: + shifted = shifted.astype(dict.fromkeys(to_coerce, "float32")) return obj - shifted diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index c9d874fc08dbe..f8e92b7e2650a 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -12,11 +12,16 @@ import numpy as np +from pandas._libs import ( + algos as libalgos, +) from pandas._libs.tslibs import OutOfBoundsDatetime from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( + ensure_int64, + ensure_platform_int, is_list_like, is_scalar, ) @@ -38,7 +43,10 @@ ) from pandas.core.series import Series -from pandas.io.formats.printing import pprint_thing +from pandas.io.formats.printing import ( + PrettyDict, + pprint_thing, +) if TYPE_CHECKING: from collections.abc import ( @@ -668,8 +676,14 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: def groups(self) -> dict[Hashable, Index]: codes, uniques = self._codes_and_uniques uniques = Index._with_infer(uniques, name=self.name) - cats = Categorical.from_codes(codes, uniques, validate=False) - return self._index.groupby(cats) + + r, counts = libalgos.groupsort_indexer(ensure_platform_int(codes), len(uniques)) + counts = ensure_int64(counts).cumsum() + _result = (r[start:end] for start, end in zip(counts, counts[1:])) + # map to the label + result = {k: self._index.take(v) for k, v in zip(uniques, _result)} + + return PrettyDict(result) @property def observed_grouping(self) -> Grouping: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c4c7f73ee166c..75f3495041917 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -1131,7 +1131,7 @@ def get_iterator(self, data: NDFrame): """ slicer = lambda start, edge: data.iloc[start:edge] - start = 0 + start: np.int64 | int = 0 for edge, label in zip(self.bins, self.binlabels): if label is not NaT: yield label, slicer(start, edge) @@ -1144,7 +1144,7 @@ def get_iterator(self, data: NDFrame): def indices(self): indices = collections.defaultdict(list) - i = 0 + i: np.int64 | int = 0 for label, bin in zip(self.binlabels, self.bins): if i < bin: if label is not NaT: diff --git 
a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 88379164534f2..6fc638e85bc5e 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -131,8 +131,8 @@ def get_window_bounds( if closed in ["left", "neither"]: end -= 1 - end = np.clip(end, 0, num_values) - start = np.clip(start, 0, num_values) + end = np.clip(end, 0, num_values) # type: ignore[assignment] + start = np.clip(start, 0, num_values) # type: ignore[assignment] return start, end @@ -402,7 +402,7 @@ def get_window_bounds( start = np.arange(0, num_values, step, dtype="int64") end = start + self.window_size if self.window_size: - end = np.clip(end, 0, num_values) + end = np.clip(end, 0, num_values) # type: ignore[assignment] return start, end @@ -488,7 +488,7 @@ def get_window_bounds( ) window_indices_start += len(indices) # Extend as we'll be slicing window like [start, end) - window_indices = np.append(window_indices, [window_indices[-1] + 1]).astype( + window_indices = np.append(window_indices, [window_indices[-1] + 1]).astype( # type: ignore[assignment] np.int64, copy=False ) start_arrays.append(window_indices.take(ensure_platform_int(start))) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2079ff8fd2873..4e1ea07907cdb 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1731,10 +1731,16 @@ def name(self) -> Hashable: """ Return Index or MultiIndex name. + Returns + ------- + label (hashable object) + The name of the Index. + See Also -------- Index.set_names: Able to set new names partially and by level. Index.rename: Able to set new names partially and by level. + Series.name: Corresponding Series property. Examples -------- @@ -1907,12 +1913,12 @@ def set_names(self, names, *, level=None, inplace: bool = False) -> Self | None: Parameters ---------- - names : label or list of label or dict-like for MultiIndex + names : Hashable or a sequence of the previous or dict-like for MultiIndex Name(s) to set. .. versionchanged:: 1.3.0 - level : int, label or list of int or label, optional + level : int, Hashable or a sequence of the previous, optional If the index is a MultiIndex and names is not dict-like, level(s) to set (None for all levels). Otherwise level must be None. @@ -2017,7 +2023,7 @@ def rename(self, name, *, inplace: bool = False) -> Self | None: Parameters ---------- - name : label or list of labels + name : Hashable or a sequence of the previous Name(s) to set. 
inplace : bool, default False Modifies the object directly, instead of creating a new Index or @@ -2961,10 +2967,14 @@ def _dti_setop_align_tzs(self, other: Index, setop: str_t) -> tuple[Index, Index and self.tz is not None and other.tz is not None ): - # GH#39328, GH#45357 - left = self.tz_convert("UTC") - right = other.tz_convert("UTC") - return left, right + # GH#39328, GH#45357, GH#60080 + # If both timezones are the same, no need to convert to UTC + if self.tz == other.tz: + return self, other + else: + left = self.tz_convert("UTC") + right = other.tz_convert("UTC") + return left, right return self, other @final @@ -7148,10 +7158,10 @@ def _logical_method(self, other, op): rvalues = extract_array(other, extract_numpy=True, extract_range=True) res_values = ops.logical_op(lvalues, rvalues, op) - return self._construct_result(res_values, name=res_name) + return self._construct_result(res_values, name=res_name, other=other) @final - def _construct_result(self, result, name): + def _construct_result(self, result, name, other): if isinstance(result, tuple): return ( Index(result[0], name=name, dtype=result[0].dtype), diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 13811c28e6c1e..8c40b630e8cfd 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1279,14 +1279,7 @@ def interval_range( breaks = np.linspace(start, end, periods) if all(is_integer(x) for x in com.not_none(start, end, freq)): # np.linspace always produces float output - - # error: Argument 1 to "maybe_downcast_numeric" has incompatible type - # "Union[ndarray[Any, Any], TimedeltaIndex, DatetimeIndex]"; - # expected "ndarray[Any, Any]" [ - breaks = maybe_downcast_numeric( - breaks, # type: ignore[arg-type] - dtype, - ) + breaks = maybe_downcast_numeric(breaks, dtype) else: # delegate to the appropriate range function if isinstance(endpoint, Timestamp): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index bcb27d0320c91..34a437ba40bd8 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -265,7 +265,7 @@ def iloc(self) -> _iLocIndexer: With scalar integers. >>> df.iloc[0, 1] - 2 + np.int64(2) With lists of integers. @@ -375,7 +375,7 @@ def loc(self) -> _LocIndexer: Single label for row and column >>> df.loc["cobra", "shield"] - 2 + np.int64(2) Slice with labels for row and single label for column. As mentioned above, note that both the start and stop of the slice are included. 
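(Illustrative aside, not part of the patch: a minimal Python sketch of the label- and position-based scalar access that the surrounding ``.loc``/``.iloc``/``.at``/``.iat`` docstrings describe. It assumes pandas with NumPy >= 2 installed, where returned scalars repr as ``np.int64(...)``; on older NumPy they print as plain ints.)

import pandas as pd

# Same example frame the .loc docstring uses
df = pd.DataFrame(
    [[1, 2], [4, 5], [7, 8]],
    index=["cobra", "viper", "sidewinder"],
    columns=["max_speed", "shield"],
)

# Label-based scalar access
print(df.loc["cobra", "shield"])   # 2 (np.int64(2) under NumPy >= 2)
# Position-based scalar access
print(df.iloc[0, 1])               # 2
# Fast scalar accessors for a single row/column pair
print(df.at["cobra", "shield"])    # 2
print(df.iat[0, 1])                # 2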
@@ -585,7 +585,7 @@ def loc(self) -> _LocIndexer: Single tuple for the index with a single label for the column >>> df.loc[("cobra", "mark i"), "shield"] - 2 + np.int64(2) Slice from index tuple to single label @@ -666,18 +666,18 @@ def at(self) -> _AtIndexer: Get value at specified row/column pair >>> df.at[4, "B"] - 2 + np.int64(2) Set value at specified row/column pair >>> df.at[4, "B"] = 10 >>> df.at[4, "B"] - 10 + np.int64(10) Get value within a Series >>> df.loc[5].at["B"] - 4 + np.int64(4) """ return _AtIndexer("at", self) @@ -715,18 +715,18 @@ def iat(self) -> _iAtIndexer: Get value at specified row/column pair >>> df.iat[1, 2] - 1 + np.int64(1) Set value at specified row/column pair >>> df.iat[1, 2] = 10 >>> df.iat[1, 2] - 10 + np.int64(10) Get value within a series >>> df.loc[0].iat[1] - 2 + np.int64(2) """ return _iAtIndexer("iat", self) @@ -1582,11 +1582,7 @@ def _validate_key(self, key, axis: AxisInt) -> None: if com.is_bool_indexer(key): if hasattr(key, "index") and isinstance(key.index, Index): if key.index.inferred_type == "integer": - raise NotImplementedError( - "iLocation based boolean " - "indexing on an integer type " - "is not available" - ) + return raise ValueError( "iLocation based boolean indexing cannot use an indexable as a mask" ) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index b990eca39b3dd..c2fbef1089d5a 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -563,7 +563,6 @@ def set_nulls( if null_kind == ColumnNullType.USE_SENTINEL: null_pos = pd.Series(data) == sentinel_val elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): - assert validity, "Expected to have a validity buffer for the mask" valid_buff, valid_dtype = validity null_pos = buffer_to_ndarray( valid_buff, valid_dtype, offset=col.offset, length=col.size() diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 202bebde88c2c..d64c7e33657d4 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -6,9 +6,9 @@ ) __all__ = [ - "Block", + "Block", # pyright:ignore[reportUnsupportedDunderAll)] "BlockManager", - "ExtensionBlock", + "ExtensionBlock", # pyright:ignore[reportUnsupportedDunderAll)] "SingleBlockManager", "concatenate_managers", "make_block", diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index dc64da35e9725..6aa5062b8ed86 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -805,7 +805,7 @@ def replace_list( for x, y in zip(src_list, dest_list) if (self._can_hold_element(x) or (self.dtype == "string" and is_re(x))) ] - if not len(pairs): + if not pairs: return [self.copy(deep=False)] src_len = len(pairs) - 1 @@ -1679,6 +1679,8 @@ def where(self, other, cond) -> list[Block]: try: res_values = arr._where(cond, other).T + except OutOfBoundsDatetime: + raise except (ValueError, TypeError): if self.ndim == 1 or self.shape[0] == 1: if isinstance(self.dtype, (IntervalDtype, StringDtype)): @@ -1746,6 +1748,8 @@ def putmask(self, mask, new) -> list[Block]: try: # Caller is responsible for ensuring matching lengths values._putmask(mask, new) + except OutOfBoundsDatetime: + raise except (TypeError, ValueError): if self.ndim == 1 or self.shape[0] == 1: if isinstance(self.dtype, IntervalDtype): @@ -2094,7 +2098,7 @@ def _unstack( self.values.take( indices, allow_fill=needs_masking[i], fill_value=fill_value ), - BlockPlacement(place), + 
BlockPlacement(place), # type: ignore[arg-type] ndim=2, ) for i, (indices, place) in enumerate(zip(new_values, new_placement)) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 69da2be0306f6..35de97d570bd3 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -634,7 +634,7 @@ def reorder_arrays( arr = np.empty(length, dtype=object) arr.fill(np.nan) else: - arr = arrays[k] + arr = arrays[k] # type: ignore[assignment] new_arrays.append(arr) arrays = new_arrays @@ -864,7 +864,7 @@ def _finalize_columns_and_data( # GH#26429 do not raise user-facing AssertionError raise ValueError(err) from err - if len(contents) and contents[0].dtype == np.object_: + if contents and contents[0].dtype == np.object_: contents = convert_object_array(contents, dtype=dtype) return contents, columns diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a3738bb25f56c..cb290fde7095c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1298,7 +1298,7 @@ def value_getitem(placement): # Defer setting the new values to enable consolidation self._iset_split_block(blkno_l, blk_locs, refs=refs) - if len(removed_blknos): + if removed_blknos: # Remove blocks & update blknos accordingly is_deleted = np.zeros(self.nblocks, dtype=np.bool_) is_deleted[removed_blknos] = True @@ -1800,6 +1800,8 @@ def as_array( arr = np.asarray(blk.values, dtype=dtype) else: arr = np.array(blk.values, dtype=dtype, copy=copy) + if passed_nan and blk.dtype.kind in "mM": + arr[isna(blk.values)] = na_value if not copy: arr = arr.view() @@ -1865,6 +1867,8 @@ def _interleave( else: arr = blk.get_values(dtype) result[rl.indexer] = arr + if na_value is not lib.no_default and blk.dtype.kind in "mM": + result[rl.indexer][isna(arr)] = na_value itemmask[rl.indexer] = 1 if not itemmask.all(): diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index 02e7445f1d275..59516b16905dc 100644 --- a/pandas/core/methods/selectn.py +++ b/pandas/core/methods/selectn.py @@ -11,6 +11,7 @@ from typing import ( TYPE_CHECKING, Generic, + Literal, cast, final, ) @@ -54,7 +55,9 @@ class SelectN(Generic[NDFrameT]): - def __init__(self, obj: NDFrameT, n: int, keep: str) -> None: + def __init__( + self, obj: NDFrameT, n: int, keep: Literal["first", "last", "all"] + ) -> None: self.obj = obj self.n = n self.keep = keep @@ -111,15 +114,25 @@ def compute(self, method: str) -> Series: if n <= 0: return self.obj[[]] - dropped = self.obj.dropna() - nan_index = self.obj.drop(dropped.index) + # Save index and reset to default index to avoid performance impact + # from when index contains duplicates + original_index: Index = self.obj.index + default_index = self.obj.reset_index(drop=True) - # slow method - if n >= len(self.obj): + # Slower method used when taking the full length of the series + # In this case, it is equivalent to a sort. 
+ if n >= len(default_index): ascending = method == "nsmallest" - return self.obj.sort_values(ascending=ascending).head(n) + result = default_index.sort_values(ascending=ascending, kind="stable").head( + n + ) + result.index = original_index.take(result.index) + return result + + # Fast method used in the general case + dropped = default_index.dropna() + nan_index = default_index.drop(dropped.index) - # fast method new_dtype = dropped.dtype # Similar to algorithms._ensure_data @@ -158,7 +171,7 @@ def compute(self, method: str) -> Series: else: kth_val = np.nan (ns,) = np.nonzero(arr <= kth_val) - inds = ns[arr[ns].argsort(kind="mergesort")] + inds = ns[arr[ns].argsort(kind="stable")] if self.keep != "all": inds = inds[:n] @@ -173,7 +186,9 @@ def compute(self, method: str) -> Series: # reverse indices inds = narr - 1 - inds - return concat([dropped.iloc[inds], nan_index]).iloc[:findex] + result = concat([dropped.iloc[inds], nan_index]).iloc[:findex] + result.index = original_index.take(result.index) + return result class SelectNFrame(SelectN[DataFrame]): @@ -192,7 +207,13 @@ class SelectNFrame(SelectN[DataFrame]): nordered : DataFrame """ - def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None: + def __init__( + self, + obj: DataFrame, + n: int, + keep: Literal["first", "last", "all"], + columns: IndexLabel, + ) -> None: super().__init__(obj, n, keep) if not is_list_like(columns) or isinstance(columns, tuple): columns = [columns] @@ -277,4 +298,4 @@ def get_indexer(current_indexer: Index, other_indexer: Index) -> Index: ascending = method == "nsmallest" - return frame.sort_values(columns, ascending=ascending, kind="mergesort") + return frame.sort_values(columns, ascending=ascending, kind="stable") diff --git a/pandas/core/missing.py b/pandas/core/missing.py index ff2daae002731..66609fa870f14 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -241,7 +241,8 @@ def find_valid_index(how: str, is_valid: npt.NDArray[np.bool_]) -> int | None: return None if is_valid.ndim == 2: - is_valid = is_valid.any(axis=1) # reduce axis 1 + # reduce axis 1 + is_valid = is_valid.any(axis=1) # type: ignore[assignment] if how == "first": idxpos = is_valid[::].argmax() @@ -312,18 +313,9 @@ def get_interp_index(method, index: Index) -> Index: # create/use the index if method == "linear": # prior default - from pandas import Index - - if isinstance(index.dtype, DatetimeTZDtype) or lib.is_np_dtype( - index.dtype, "mM" - ): - # Convert datetime-like indexes to int64 - index = Index(index.view("i8")) - - elif not is_numeric_dtype(index.dtype): - # We keep behavior consistent with prior versions of pandas for - # non-numeric, non-datetime indexes - index = Index(range(len(index))) + from pandas import RangeIndex + + index = RangeIndex(len(index)) else: methods = {"index", "values", "nearest", "time"} is_numeric_or_datetime = ( @@ -413,10 +405,7 @@ def func(yvalues: np.ndarray) -> None: **kwargs, ) - # error: No overload variant of "apply_along_axis" matches - # argument types "Callable[[ndarray[Any, Any]], None]", - # "int", "ndarray[Any, Any]" - np.apply_along_axis(func, axis, data) # type: ignore[call-overload] + np.apply_along_axis(func, axis, data) def _index_to_interp_indices(index: Index, method: str) -> np.ndarray: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index d1dc0ff809497..666b108717837 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -508,12 +508,12 @@ def nanany( >>> from pandas.core import nanops >>> s = pd.Series([1, 2]) >>> 
nanops.nanany(s.values) - True + np.True_ >>> from pandas.core import nanops >>> s = pd.Series([np.nan]) >>> nanops.nanany(s.values) - False + np.False_ """ if values.dtype.kind in "iub" and mask is None: # GH#26032 fastpath @@ -564,12 +564,12 @@ def nanall( >>> from pandas.core import nanops >>> s = pd.Series([1, 2, np.nan]) >>> nanops.nanall(s.values) - True + np.True_ >>> from pandas.core import nanops >>> s = pd.Series([1, 0]) >>> nanops.nanall(s.values) - False + np.False_ """ if values.dtype.kind in "iub" and mask is None: # GH#26032 fastpath @@ -625,7 +625,7 @@ def nansum( >>> from pandas.core import nanops >>> s = pd.Series([1, 2, np.nan]) >>> nanops.nansum(s.values) - 3.0 + np.float64(3.0) """ dtype = values.dtype values, mask = _get_values(values, skipna, fill_value=0, mask=mask) @@ -691,7 +691,7 @@ def nanmean( >>> from pandas.core import nanops >>> s = pd.Series([1, 2, np.nan]) >>> nanops.nanmean(s.values) - 1.5 + np.float64(1.5) """ dtype = values.dtype values, mask = _get_values(values, skipna, fill_value=0, mask=mask) @@ -1014,7 +1014,11 @@ def nanvar( avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count if axis is not None: avg = np.expand_dims(avg, axis) - sqr = _ensure_numeric((avg - values) ** 2) + if values.dtype.kind == "c": + # Need to use absolute value for complex numbers. + sqr = _ensure_numeric(abs(avg - values) ** 2) + else: + sqr = _ensure_numeric((avg - values) ** 2) if mask is not None: np.putmask(sqr, mask, 0) result = sqr.sum(axis=axis, dtype=np.float64) / d @@ -1061,7 +1065,7 @@ def nansem( >>> from pandas.core import nanops >>> s = pd.Series([1, np.nan, 2, 3]) >>> nanops.nansem(s.values) - 0.5773502691896258 + np.float64(0.5773502691896258) """ # This checks if non-numeric-like data is passed with numeric_only=False # and raises a TypeError otherwise @@ -1136,7 +1140,7 @@ def nanargmax( >>> from pandas.core import nanops >>> arr = np.array([1, 2, 3, np.nan, 4]) >>> nanops.nanargmax(arr) - 4 + np.int64(4) >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3) >>> arr[2:, 2] = np.nan @@ -1182,7 +1186,7 @@ def nanargmin( >>> from pandas.core import nanops >>> arr = np.array([1, 2, 3, np.nan, 4]) >>> nanops.nanargmin(arr) - 0 + np.int64(0) >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3) >>> arr[2:, 0] = np.nan @@ -1237,7 +1241,7 @@ def nanskew( >>> from pandas.core import nanops >>> s = pd.Series([1, np.nan, 1, 2]) >>> nanops.nanskew(s.values) - 1.7320508075688787 + np.float64(1.7320508075688787) """ mask = _maybe_get_mask(values, skipna, mask) if values.dtype.kind != "f": @@ -1325,7 +1329,7 @@ def nankurt( >>> from pandas.core import nanops >>> s = pd.Series([1, np.nan, 1, 3, 2]) >>> nanops.nankurt(s.values) - -1.2892561983471076 + np.float64(-1.2892561983471076) """ mask = _maybe_get_mask(values, skipna, mask) if values.dtype.kind != "f": @@ -1417,7 +1421,7 @@ def nanprod( >>> from pandas.core import nanops >>> s = pd.Series([1, 2, 3, np.nan]) >>> nanops.nanprod(s.values) - 6.0 + np.float64(6.0) """ mask = _maybe_get_mask(values, skipna, mask) diff --git a/pandas/core/ops/invalid.py b/pandas/core/ops/invalid.py index 395db1617cb63..62aa79a881717 100644 --- a/pandas/core/ops/invalid.py +++ b/pandas/core/ops/invalid.py @@ -25,7 +25,7 @@ def invalid_comparison( left: ArrayLike, - right: ArrayLike | Scalar, + right: ArrayLike | list | Scalar, op: Callable[[Any, Any], bool], ) -> npt.NDArray[np.bool_]: """ diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 1d27687d15af0..08e3beef99e60 100644 --- 
a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -518,6 +518,7 @@ def _wrap_result(self, result): if self._timegrouper._arrow_dtype is not None: result.index = result.index.astype(self._timegrouper._arrow_dtype) + result.index.name = self.obj.index.name return result @@ -897,17 +898,17 @@ def interpolate( to non-aligned timestamps, as in the following example: >>> series.resample("400ms").interpolate("linear") - 2023-03-01 07:00:00.000 1.0 - 2023-03-01 07:00:00.400 0.2 - 2023-03-01 07:00:00.800 -0.6 - 2023-03-01 07:00:01.200 -0.4 - 2023-03-01 07:00:01.600 0.8 - 2023-03-01 07:00:02.000 2.0 - 2023-03-01 07:00:02.400 1.6 - 2023-03-01 07:00:02.800 1.2 - 2023-03-01 07:00:03.200 1.4 - 2023-03-01 07:00:03.600 2.2 - 2023-03-01 07:00:04.000 3.0 + 2023-03-01 07:00:00.000 1.000000 + 2023-03-01 07:00:00.400 0.333333 + 2023-03-01 07:00:00.800 -0.333333 + 2023-03-01 07:00:01.200 0.000000 + 2023-03-01 07:00:01.600 1.000000 + 2023-03-01 07:00:02.000 2.000000 + 2023-03-01 07:00:02.400 1.666667 + 2023-03-01 07:00:02.800 1.333333 + 2023-03-01 07:00:03.200 1.666667 + 2023-03-01 07:00:03.600 2.333333 + 2023-03-01 07:00:04.000 3.000000 Freq: 400ms, dtype: float64 Note that the series correctly decreases between two anchors diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 6a590ee5b227e..ad4a5db441b89 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -60,13 +60,15 @@ def get_dummies( data : array-like, Series, or DataFrame Data of which to get dummy indicators. prefix : str, list of str, or dict of str, default None - String to append DataFrame column names. + A string to be prepended to DataFrame column names. Pass a list with length equal to the number of columns when calling get_dummies on a DataFrame. Alternatively, `prefix` can be a dictionary mapping column names to prefixes. - prefix_sep : str, default '_' - If appending prefix, separator/delimiter to use. Or pass a - list or dictionary as with `prefix`. + prefix_sep : str, list of str, or dict of str, default '_' + Should you choose to prepend DataFrame column names with a prefix, this + is the separator/delimiter to use between the two. Alternatively, + `prefix_sep` can be a list with length equal to the number of columns, + or a dictionary mapping column names to separators. dummy_na : bool, default False If True, a NaN indicator column will be added even if no NaN values are present. If False, NA values are encoded as all zero. 
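As a side note on the reworded ``prefix``/``prefix_sep`` documentation above, here is a minimal sketch of how the two combine to build dummy column names. This is standard ``get_dummies`` behaviour shown only for context; the column and prefix names are illustrative:

import pandas as pd

df = pd.DataFrame({"col": ["a", "b", "a"]})

# A single string prefix is prepended to every generated column,
# joined by prefix_sep: the columns become "key::a" and "key::b".
pd.get_dummies(df, prefix="key", prefix_sep="::")

# Both arguments also accept a dict keyed by the encoded column names,
# giving per-column prefixes and separators.
df2 = pd.DataFrame({"col": ["a", "b"], "other": ["x", "y"]})
pd.get_dummies(
    df2,
    prefix={"col": "c", "other": "o"},
    prefix_sep={"col": "_", "other": "-"},
)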
@@ -357,7 +359,7 @@ def get_empty_frame(data) -> DataFrame: if drop_first: # remove first GH12042 - dummy_mat = dummy_mat[:, 1:] + dummy_mat = dummy_mat[:, 1:] # type: ignore[assignment] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index f4cb82816bbcf..20b4cd2185bb4 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -182,6 +182,10 @@ def melt( value_vars_was_not_none = value_vars is not None value_vars = ensure_list_vars(value_vars, "value_vars", frame.columns) + # GH61475 - prevent AttributeError when duplicate column in id_vars + if len(frame.columns.get_indexer_for(id_vars)) > len(id_vars): + raise ValueError("id_vars cannot contain duplicate columns.") + if id_vars or value_vars: if col_level is not None: level = frame.columns.get_level_values(col_level) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 09be82c59a5c6..34f3e2c626378 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -198,15 +198,15 @@ def merge( to SQL left anti join; preserve key order. * right_anti: use only keys from right frame that are not in left frame, similar to SQL right anti join; preserve key order. - on : label or list + on : Hashable or a sequence of the previous Column or index level names to join on. These must be found in both DataFrames. If `on` is None and not merging on indexes then this defaults to the intersection of the columns in both DataFrames. - left_on : label or list, or array-like + left_on : Hashable or a sequence of the previous, or array-like Column or index level names to join on in the left DataFrame. Can also be an array or list of arrays of the length of the left DataFrame. These arrays are treated as if they are columns. - right_on : label or list, or array-like + right_on : Hashable or a sequence of the previous, or array-like Column or index level names to join on in the right DataFrame. Can also be an array or list of arrays of the length of the right DataFrame. These arrays are treated as if they are columns. @@ -536,13 +536,13 @@ def merge_ordered( First pandas object to merge. right : DataFrame or named Series Second pandas object to merge. - on : label or list + on : Hashable or a sequence of the previous Field names to join on. Must be found in both DataFrames. - left_on : label or list, or array-like + left_on : Hashable or a sequence of the previous, or array-like Field names to join on in left DataFrame. Can be a vector or list of vectors of the length of the DataFrame to use a particular vector as the join key instead of columns. - right_on : label or list, or array-like + right_on : Hashable or a sequence of the previous, or array-like Field names to join on in right DataFrame or vector/list of vectors per left_on docs. 
left_by : column name or list of column names @@ -2921,9 +2921,7 @@ def _convert_arrays_and_get_rizer_klass( lk = lk.astype(dtype, copy=False) rk = rk.astype(dtype, copy=False) if isinstance(lk, BaseMaskedArray): - # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]"; - # expected type "Type[object]" - klass = _factorizers[lk.dtype.type] # type: ignore[index] + klass = _factorizers[lk.dtype.type] elif isinstance(lk.dtype, ArrowDtype): klass = _factorizers[lk.dtype.numpy_dtype.type] else: @@ -3064,13 +3062,16 @@ def renamer(x, suffix: str | None): if not llabels.is_unique: # Only warn when duplicates are caused because of suffixes, already duplicated # columns in origin should not warn - dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist() + dups.extend(llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()) if not rlabels.is_unique: dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist()) + # Suffix addition creates duplicate to pre-existing column name + dups.extend(llabels.intersection(right.difference(to_rename)).tolist()) + dups.extend(rlabels.intersection(left.difference(to_rename)).tolist()) if dups: raise MergeError( f"Passing 'suffixes' which cause duplicate columns {set(dups)} is " - f"not allowed.", + "not allowed.", ) return llabels, rlabels diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index cfc6f91557781..ac89f19b80a0f 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -76,12 +76,12 @@ def pivot_table( Input pandas DataFrame object. values : list-like or scalar, optional Column or columns to aggregate. - index : column, Grouper, array, or list of the previous + index : column, Grouper, array, or sequence of the previous Keys to group by on the pivot table index. If a list is passed, it can contain any of the other types (except list). If an array is passed, it must be the same length as the data and will be used in the same manner as column values. - columns : column, Grouper, array, or list of the previous + columns : column, Grouper, array, or sequence of the previous Keys to group by on the pivot table column. If a list is passed, it can contain any of the other types (except list). If an array is passed, it must be the same length as the data and will be used in @@ -102,8 +102,11 @@ def pivot_table( on the rows and columns. dropna : bool, default True Do not include columns whose entries are all NaN. If True, - rows with a NaN value in any column will be omitted before - computing margins. + + * rows with an NA value in any column will be omitted before computing margins, + * index/column keys containing NA values will be dropped (see ``dropna`` + parameter in :meth:``DataFrame.groupby``). + margins_name : str, default 'All' Name of the row / column that will contain the totals when margins is True. @@ -333,6 +336,11 @@ def __internal_pivot_table( values = list(values) grouped = data.groupby(keys, observed=observed, sort=sort, dropna=dropna) + if values_passed: + # GH#57876 and GH#61292 + # mypy is not aware `grouped[values]` will always be a DataFrameGroupBy + grouped = grouped[values] # type: ignore[assignment] + agged = grouped.agg(aggfunc, **kwargs) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): @@ -700,11 +708,11 @@ def pivot( ---------- data : DataFrame Input pandas DataFrame object. - columns : str or object or a list of str + columns : Hashable or a sequence of the previous Column to use to make new frame's columns. 
- index : str or object or a list of str, optional + index : Hashable or a sequence of the previous, optional Column to use to make new frame's index. If not given, uses existing index. - values : str, object or a list of the previous, optional + values : Hashable or a sequence of the previous, optional Column(s) to use for populating new frame's values. If not specified, all remaining columns will be used and the result will have hierarchically indexed columns. diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c60fe71a7ff28..d2a838b616426 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -936,7 +936,20 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: [k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels] ) - result = stack_reshape(frame, level, set_levels, stack_cols) + result: Series | DataFrame + if not isinstance(frame.columns, MultiIndex): + # GH#58817 Fast path when we're stacking the columns of a non-MultiIndex. + # When columns are homogeneous EAs, we pass through object + # dtype but this is still slightly faster than the normal path. + if len(frame.columns) > 0 and frame._is_homogeneous_type: + dtype = frame._mgr.blocks[0].dtype + else: + dtype = None + result = frame._constructor_sliced( + frame._values.reshape(-1, order="F"), dtype=dtype + ) + else: + result = stack_reshape(frame, level, set_levels, stack_cols) # Construct the correct MultiIndex by combining the frame's index and # stacked columns. @@ -1018,6 +1031,8 @@ def stack_reshape( ------- The data of behind the stacked DataFrame. """ + # non-MultIndex takes a fast path. + assert isinstance(frame.columns, MultiIndex) # If we need to drop `level` from columns, it needs to be in descending order drop_levnums = sorted(level, reverse=True) @@ -1027,18 +1042,14 @@ def stack_reshape( if len(frame.columns) == 1: data = frame.copy(deep=False) else: - if not isinstance(frame.columns, MultiIndex) and not isinstance(idx, tuple): - # GH#57750 - if the frame is an Index with tuples, .loc below will fail - column_indexer = idx - else: - # Take the data from frame corresponding to this idx value - if len(level) == 1: - idx = (idx,) - gen = iter(idx) - column_indexer = tuple( - next(gen) if k in set_levels else slice(None) - for k in range(frame.columns.nlevels) - ) + # Take the data from frame corresponding to this idx value + if len(level) == 1: + idx = (idx,) + gen = iter(idx) + column_indexer = tuple( + next(gen) if k in set_levels else slice(None) + for k in range(frame.columns.nlevels) + ) data = frame.loc[:, column_indexer] if len(level) < frame.columns.nlevels: diff --git a/pandas/core/sample.py b/pandas/core/sample.py index 5b1c4b6a331f5..4f12563e3c5e2 100644 --- a/pandas/core/sample.py +++ b/pandas/core/sample.py @@ -123,7 +123,7 @@ def sample( random_state: np.random.RandomState | np.random.Generator, ) -> np.ndarray: """ - Randomly sample `size` indices in `np.arange(obj_len)` + Randomly sample `size` indices in `np.arange(obj_len)`. 
Parameters ---------- diff --git a/pandas/core/series.py b/pandas/core/series.py index 258e0100a8558..7a26be875e7b5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -52,6 +52,9 @@ doc, set_module, ) +from pandas.util._exceptions import ( + find_stack_level, +) from pandas.util._validators import ( validate_ascending, validate_bool_kwarg, @@ -2511,6 +2514,8 @@ def round(self, decimals: int = 0, *args, **kwargs) -> Series: dtype: float64 """ nv.validate_round(args, kwargs) + if self.dtype == "object": + raise TypeError("Expected numeric dtype, got object instead.") new_mgr = self._mgr.round(decimals=decimals) return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__( self, method="round" @@ -2948,8 +2953,9 @@ def dot(self, other: AnyArrayLike | DataFrame) -> Series | np.ndarray: ) if isinstance(other, ABCDataFrame): + common_type = find_common_type([self.dtypes] + list(other.dtypes)) return self._constructor( - np.dot(lvals, rvals), index=other.columns, copy=False + np.dot(lvals, rvals), index=other.columns, copy=False, dtype=common_type ).__finalize__(self, method="dot") elif isinstance(other, Series): return np.dot(lvals, rvals) @@ -4320,8 +4326,9 @@ def unstack( def map( self, - arg: Callable | Mapping | Series, + func: Callable | Mapping | Series | None = None, na_action: Literal["ignore"] | None = None, + engine: Callable | None = None, **kwargs, ) -> Series: """ @@ -4333,11 +4340,30 @@ def map( Parameters ---------- - arg : function, collections.abc.Mapping subclass or Series - Mapping correspondence. + func : function, collections.abc.Mapping subclass or Series + Function or mapping correspondence. na_action : {None, 'ignore'}, default None If 'ignore', propagate NaN values, without passing them to the mapping correspondence. + engine : decorator, optional + Choose the execution engine to use to run the function. Only used for + functions. If ``map`` is called with a mapping or ``Series``, an + exception will be raised. If ``engine`` is not provided the function will + be executed by the regular Python interpreter. + + Options include JIT compilers such as Numba, Bodo or Blosc2, which in some + cases can speed up the execution. To use an executor you can provide the + decorators ``numba.jit``, ``numba.njit``, ``bodo.jit`` or ``blosc2.jit``. + You can also provide the decorator with parameters, like + ``numba.jit(nogit=True)``. + + Not all functions can be executed with all execution engines. In general, + JIT compilers will require type stability in the function (no variable + should change data type during the execution). And not all pandas and + NumPy APIs are supported. Check the engine documentation for limitations. + + .. versionadded:: 3.0.0 + **kwargs Additional keyword arguments to pass as keywords arguments to `arg`. 
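A minimal sketch of the updated ``Series.map`` call documented above: the first parameter is now named ``func``, and ``engine`` accepts a decorator for function inputs only. The JIT line is illustrative and assumes an installed engine that exposes the ``__pandas_udf__`` hook:

import pandas as pd

s = pd.Series(["cat", "dog", None, "rabbit"])

# ``func`` replaces the old ``arg`` name; passing ``arg=`` still works
# for now but emits a FutureWarning.
out = s.map(lambda x: f"I am a {x}", na_action="ignore")

# Per the docstring above, a JIT decorator such as numba.jit or bodo.jit
# can be supplied as the execution engine for callables only; it must
# expose ``__pandas_udf__`` (availability depends on the installed library).
# out = s.map(lambda x: x * 2, engine=numba.jit)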
@@ -4404,9 +4430,41 @@ def map( 3 I am a rabbit dtype: object """ - if callable(arg): - arg = functools.partial(arg, **kwargs) - new_values = self._map_values(arg, na_action=na_action) + if func is None: + if "arg" in kwargs: + # `.map(arg=my_func)` + func = kwargs.pop("arg") + warnings.warn( + "The parameter `arg` has been renamed to `func`, and it " + "will stop being supported in a future version of pandas.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + raise ValueError("The `func` parameter is required") + + if engine is not None: + if not callable(func): + raise ValueError( + "The engine argument can only be specified when func is a function" + ) + if not hasattr(engine, "__pandas_udf__"): + raise ValueError(f"Not a valid engine: {engine!r}") + result = engine.__pandas_udf__.map( # type: ignore[attr-defined] + data=self, + func=func, + args=(), + kwargs=kwargs, + decorator=engine, + skip_na=na_action == "ignore", + ) + if not isinstance(result, Series): + result = Series(result, index=self.index, name=self.name) + return result.__finalize__(self, method="map") + + if callable(func): + func = functools.partial(func, **kwargs) + new_values = self._map_values(func, na_action=na_action) return self._constructor(new_values, index=self.index, copy=False).__finalize__( self, method="map" ) @@ -5858,7 +5916,7 @@ def _cmp_method(self, other, op): res_values = ops.comparison_op(lvalues, rvalues, op) - return self._construct_result(res_values, name=res_name) + return self._construct_result(res_values, name=res_name, other=other) def _logical_method(self, other, op): res_name = ops.get_op_result_name(self, other) @@ -5868,7 +5926,7 @@ def _logical_method(self, other, op): rvalues = extract_array(other, extract_numpy=True, extract_range=True) res_values = ops.logical_op(lvalues, rvalues, op) - return self._construct_result(res_values, name=res_name) + return self._construct_result(res_values, name=res_name, other=other) def _arith_method(self, other, op): self, other = self._align_for_op(other) @@ -5930,11 +5988,15 @@ def _binop(self, other: Series, func, level=None, fill_value=None) -> Series: result = func(this_vals, other_vals) name = ops.get_op_result_name(self, other) - out = this._construct_result(result, name) + + out = this._construct_result(result, name, other) return cast(Series, out) def _construct_result( - self, result: ArrayLike | tuple[ArrayLike, ArrayLike], name: Hashable + self, + result: ArrayLike | tuple[ArrayLike, ArrayLike], + name: Hashable, + other: AnyArrayLike | DataFrame, ) -> Series | tuple[Series, Series]: """ Construct an appropriately-labelled Series from the result of an op. 
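The ``other`` operand threaded into ``_construct_result`` above is what lets ``__finalize__`` see both sides of a binary or comparison op. A rough illustration of the intended effect on metadata such as ``attrs`` follows; the exact override semantics are those of ``__finalize__``, and the variable names are illustrative:

import pandas as pd

left = pd.Series([1, 2, 3])
right = pd.Series([3, 2, 1])
right.attrs = {"source": "sensor-b"}

# With ``other`` forwarded, the result of a comparison can now pick up
# metadata attached to the right-hand operand as well; previously only
# the left-hand Series was passed to __finalize__.
result = left == right
print(result.attrs)  # expected to contain {"source": "sensor-b"} after this change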
@@ -5943,6 +6005,7 @@ def _construct_result( ---------- result : ndarray or ExtensionArray name : Label + other : Series, DataFrame or array-like Returns ------- @@ -5952,8 +6015,8 @@ def _construct_result( if isinstance(result, tuple): # produced by divmod or rdivmod - res1 = self._construct_result(result[0], name=name) - res2 = self._construct_result(result[1], name=name) + res1 = self._construct_result(result[0], name=name, other=other) + res2 = self._construct_result(result[1], name=name, other=other) # GH#33427 assertions to keep mypy happy assert isinstance(res1, Series) @@ -5965,6 +6028,7 @@ def _construct_result( dtype = getattr(result, "dtype", None) out = self._constructor(result, index=self.index, dtype=dtype, copy=False) out = out.__finalize__(self) + out = out.__finalize__(other) # Set the result's name after __finalize__ is called because __finalize__ # would set it back to self.name diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 81fa508ae6d23..bf30c215596f2 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -380,7 +380,7 @@ replaced with `value` - str: string exactly matching `to_replace` will be replaced with `value` - - regex: regexs matching `to_replace` will be replaced with + - regex: regexes matching `to_replace` will be replaced with `value` * list of str, regex, or numeric: @@ -388,7 +388,7 @@ - First, if `to_replace` and `value` are both lists, they **must** be the same length. - Second, if ``regex=True`` then all of the strings in **both** - lists will be interpreted as regexs otherwise they will match + lists will be interpreted as regexes otherwise they will match directly. This doesn't matter much for `value` since there are only a few possible substitution regexes you can use. - str, regex and numeric rules apply as above. diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 0d8f42694ccb4..18983af12976c 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -476,7 +476,7 @@ def nargminmax(values: ExtensionArray, method: str, axis: AxisInt = 0): zipped = zip(arr_values, mask) else: zipped = zip(arr_values.T, mask.T) - return np.array([_nanargminmax(v, m, func) for v, m in zipped]) + return np.array([_nanargminmax(v, m, func) for v, m in zipped]) # type: ignore[arg-type] return func(arr_values, axis=axis) return _nanargminmax(arr_values, mask, func) diff --git a/pandas/io/api.py b/pandas/io/api.py index d4982399a604b..5900c94384384 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -10,6 +10,7 @@ ) from pandas.io.feather_format import read_feather from pandas.io.html import read_html +from pandas.io.iceberg import read_iceberg from pandas.io.json import read_json from pandas.io.orc import read_orc from pandas.io.parquet import read_parquet @@ -47,6 +48,7 @@ "read_fwf", "read_hdf", "read_html", + "read_iceberg", "read_json", "read_orc", "read_parquet", diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index ebcafce8f4de2..1dc6c1f08b49a 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -89,7 +89,7 @@ ) _read_excel_doc = ( """ -Read an Excel file into a ``pandas`` ``DataFrame``. +Read an Excel file into a ``DataFrame``. Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions read from a local filesystem or URL. 
Supports an option to read diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 75bcb51ef4be2..1b9eb6303fe74 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -90,9 +90,9 @@ def __init__( self.index_label = self._initialize_index_label(index_label) self.errors = errors self.quoting = quoting or csvlib.QUOTE_MINIMAL - self.quotechar = self._initialize_quotechar(quotechar) self.doublequote = doublequote self.escapechar = escapechar + self.quotechar = self._initialize_quotechar(quotechar) self.lineterminator = lineterminator or os.linesep self.date_format = date_format self.cols = self._initialize_columns(cols) @@ -141,7 +141,7 @@ def _get_index_label_flat(self) -> Sequence[Hashable]: return [""] if index_label is None else [index_label] def _initialize_quotechar(self, quotechar: str | None) -> str | None: - if self.quoting != csvlib.QUOTE_NONE: + if self.quoting != csvlib.QUOTE_NONE or self.escapechar is not None: # prevents crash in _csv return quotechar return None diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index fb799361fea67..097e508d4889a 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -67,7 +67,6 @@ ExtensionArray, TimedeltaArray, ) -from pandas.core.arrays.string_ import StringDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.indexes.api import ( @@ -115,7 +114,7 @@ columns : array-like, optional, default None The subset of columns to write. Writes all columns by default. col_space : %(col_space_type)s, optional - %(col_space)s. + %(col_space)s header : %(header_type)s, optional %(header)s. index : bool, optional, default True @@ -566,7 +565,7 @@ def _initialize_colspace(self, col_space: ColspaceArgType | None) -> ColspaceTyp result = {} elif isinstance(col_space, (int, str)): result = {"": col_space} - result.update({column: col_space for column in self.frame.columns}) + result.update(dict.fromkeys(self.frame.columns, col_space)) elif isinstance(col_space, Mapping): for column in col_space.keys(): if column not in self.frame.columns and column != "": @@ -1218,8 +1217,6 @@ def _format(x): return self.na_rep elif isinstance(x, PandasObject): return str(x) - elif isinstance(x, StringDtype): - return repr(x) else: # object dtype return str(formatter(x)) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index c9a6e94a0c7c1..eb579f7149d44 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -249,7 +249,7 @@ Print a concise summary of a {klass}. This method prints information about a {klass} including - the index dtype{type_sub}, non-null values and memory usage. + the index dtype{type_sub}, non-NA values and memory usage. 
{version_added_sub}\ Parameters diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 482ed316c7ce4..6752c83d5169b 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -1225,7 +1225,7 @@ def format( data = self.data.loc[subset] if not isinstance(formatter, dict): - formatter = {col: formatter for col in data.columns} + formatter = dict.fromkeys(data.columns, formatter) cis = self.columns.get_indexer_for(data.columns) ris = self.index.get_indexer_for(data.index) @@ -1411,7 +1411,7 @@ def format_index( return self # clear the formatter / revert to default and avoid looping if not isinstance(formatter, dict): - formatter = {level: formatter for level in levels_} + formatter = dict.fromkeys(levels_, formatter) else: formatter = { obj._get_level_number(level): formatter_ @@ -1708,7 +1708,7 @@ def format_index_names( return self # clear the formatter / revert to default and avoid looping if not isinstance(formatter, dict): - formatter = {level: formatter for level in levels_} + formatter = dict.fromkeys(levels_, formatter) else: formatter = { obj._get_level_number(level): formatter_ diff --git a/pandas/io/iceberg.py b/pandas/io/iceberg.py new file mode 100644 index 0000000000000..dcb675271031e --- /dev/null +++ b/pandas/io/iceberg.py @@ -0,0 +1,151 @@ +from typing import ( + Any, +) + +from pandas.compat._optional import import_optional_dependency + +from pandas import DataFrame + + +def read_iceberg( + table_identifier: str, + catalog_name: str | None = None, + *, + catalog_properties: dict[str, Any] | None = None, + row_filter: str | None = None, + selected_fields: tuple[str] | None = None, + case_sensitive: bool = True, + snapshot_id: int | None = None, + limit: int | None = None, + scan_properties: dict[str, Any] | None = None, +) -> DataFrame: + """ + Read an Apache Iceberg table into a pandas DataFrame. + + .. versionadded:: 3.0.0 + + .. warning:: + + read_iceberg is experimental and may change without warning. + + Parameters + ---------- + table_identifier : str + Table identifier. + catalog_name : str, optional + The name of the catalog. + catalog_properties : dict of {str: str}, optional + The properties that are used next to the catalog configuration. + row_filter : str, optional + A string that describes the desired rows. + selected_fields : tuple of str, optional + A tuple of strings representing the column names to return in the output + dataframe. + case_sensitive : bool, default True + If True column matching is case sensitive. + snapshot_id : int, optional + Snapshot ID to time travel to. By default the table will be scanned as of the + current snapshot ID. + limit : int, optional + An integer representing the number of rows to return in the scan result. + By default all matching rows will be fetched. + scan_properties : dict of {str: obj}, optional + Additional Table properties as a dictionary of string key value pairs to use + for this scan. + + Returns + ------- + DataFrame + DataFrame based on the Iceberg table. + + See Also + -------- + read_parquet : Read a Parquet file. + + Examples + -------- + >>> df = pd.read_iceberg( + ... table_identifier="my_table", + ... catalog_name="my_catalog", + ... catalog_properties={"s3.secret-access-key": "my-secret"}, + ... row_filter="trip_distance >= 10.0", + ... selected_fields=("VendorID", "tpep_pickup_datetime"), + ... 
) # doctest: +SKIP + """ + pyiceberg_catalog = import_optional_dependency("pyiceberg.catalog") + pyiceberg_expressions = import_optional_dependency("pyiceberg.expressions") + if catalog_properties is None: + catalog_properties = {} + catalog = pyiceberg_catalog.load_catalog(catalog_name, **catalog_properties) + table = catalog.load_table(table_identifier) + if row_filter is None: + row_filter = pyiceberg_expressions.AlwaysTrue() + if selected_fields is None: + selected_fields = ("*",) + if scan_properties is None: + scan_properties = {} + result = table.scan( + row_filter=row_filter, + selected_fields=selected_fields, + case_sensitive=case_sensitive, + snapshot_id=snapshot_id, + options=scan_properties, + limit=limit, + ) + return result.to_pandas() + + +def to_iceberg( + df: DataFrame, + table_identifier: str, + catalog_name: str | None = None, + *, + catalog_properties: dict[str, Any] | None = None, + location: str | None = None, + append: bool = False, + snapshot_properties: dict[str, str] | None = None, +) -> None: + """ + Write a DataFrame to an Apache Iceberg table. + + .. versionadded:: 3.0.0 + + Parameters + ---------- + table_identifier : str + Table identifier. + catalog_name : str, optional + The name of the catalog. + catalog_properties : dict of {str: str}, optional + The properties that are used next to the catalog configuration. + location : str, optional + Location for the table. + append : bool, default False + If ``True``, append data to the table, instead of replacing the content. + snapshot_properties : dict of {str: str}, optional + Custom properties to be added to the snapshot summary + + See Also + -------- + read_iceberg : Read an Apache Iceberg table. + DataFrame.to_parquet : Write a DataFrame in Parquet format. + """ + pa = import_optional_dependency("pyarrow") + pyiceberg_catalog = import_optional_dependency("pyiceberg.catalog") + if catalog_properties is None: + catalog_properties = {} + catalog = pyiceberg_catalog.load_catalog(catalog_name, **catalog_properties) + arrow_table = pa.Table.from_pandas(df) + table = catalog.create_table_if_not_exists( + identifier=table_identifier, + schema=arrow_table.schema, + location=location, + # we could add `partition_spec`, `sort_order` and `properties` in the + # future, but it may not be trivial without exposing PyIceberg objects + ) + if snapshot_properties is None: + snapshot_properties = {} + if append: + table.append(arrow_table, snapshot_properties=snapshot_properties) + else: + table.overwrite(arrow_table, snapshot_properties=snapshot_properties) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 1a2d564d5b44d..02e0ec5247e74 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -218,7 +218,6 @@ def to_orc( if engine != "pyarrow": raise ValueError("engine must be 'pyarrow'") - pyarrow = import_optional_dependency(engine, min_version="10.0.1") pa = import_optional_dependency("pyarrow") orc = import_optional_dependency("pyarrow.orc") @@ -229,7 +228,7 @@ def to_orc( with get_handle(path, "wb", is_text=False) as handles: try: orc.write_table( - pyarrow.Table.from_pandas(df, preserve_index=index), + pa.Table.from_pandas(df, preserve_index=index), handles.handle, **engine_kwargs, ) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 818c9f5ff6b80..aa9f3556c8f62 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -258,8 +258,9 @@ def read( ) columns = _filter_usecols(self.usecols, columns) + columns_set = set(columns) - col_dict 
= {k: v for k, v in col_dict.items() if k in columns} + col_dict = {k: v for k, v in col_dict.items() if k in columns_set} return index, columns, col_dict diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index e7b5c7f06a79a..547d8c1fe3d19 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1468,7 +1468,7 @@ def detect_colspecs( shifted[0] = 0 edges = np.where((mask ^ shifted) == 1)[0] edge_pairs = list(zip(edges[::2], edges[1::2])) - return edge_pairs + return edge_pairs # type: ignore[return-value] def __next__(self) -> list[str]: # Argument 1 to "next" has incompatible type "Union[IO[str], diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index a689cfbcb1418..c58b4a4be6df1 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -39,6 +39,7 @@ ) from pandas._libs.lib import is_string_array from pandas._libs.tslibs import timezones +from pandas.compat import HAS_PYARROW from pandas.compat._optional import import_optional_dependency from pandas.compat.pickle_compat import patch_pickle from pandas.errors import ( @@ -381,6 +382,13 @@ def read_hdf( DataFrame.to_hdf : Write a HDF file from a DataFrame. HDFStore : Low-level access to HDF files. + Notes + ----- + When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true, + and PyArrow is installed, if a UTF-16 surrogate is encountered when decoding + to UTF-8, the resulting dtype will be + ``pd.StringDtype(storage="python", na_value=np.nan)``. + Examples -------- >>> df = pd.DataFrame([[1, 1.0, "a"]], columns=["x", "y", "z"]) # doctest: +SKIP @@ -1760,7 +1768,7 @@ def info(self) -> str: if self.is_open: lkeys = sorted(self.keys()) - if len(lkeys): + if lkeys: keys = [] values = [] @@ -2257,6 +2265,20 @@ def convert( # making an Index instance could throw a number of different errors try: new_pd_index = factory(values, **kwargs) + except UnicodeEncodeError as err: + if ( + errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + new_pd_index = factory( + values, + dtype=StringDtype(storage="python", na_value=np.nan), + **kwargs, + ) + else: + raise except ValueError: # if the output freq is different that what we recorded, # it should be None (see also 'doc example part 2') @@ -3170,12 +3192,29 @@ def read_index_node( **kwargs, ) else: - index = factory( - _unconvert_index( - data, kind, encoding=self.encoding, errors=self.errors - ), - **kwargs, - ) + try: + index = factory( + _unconvert_index( + data, kind, encoding=self.encoding, errors=self.errors + ), + **kwargs, + ) + except UnicodeEncodeError as err: + if ( + self.errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + index = factory( + _unconvert_index( + data, kind, encoding=self.encoding, errors=self.errors + ), + dtype=StringDtype(storage="python", na_value=np.nan), + **kwargs, + ) + else: + raise index.name = name @@ -3311,13 +3350,24 @@ def read( self.validate_read(columns, where) index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) - result = Series(values, index=index, name=self.name, copy=False) - if ( - using_string_dtype() - and isinstance(values, np.ndarray) - and is_string_array(values, skipna=True) - ): - result = result.astype(StringDtype(na_value=np.nan)) + try: + result = Series(values, index=index, name=self.name, copy=False) + 
except UnicodeEncodeError as err: + if ( + self.errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + result = Series( + values, + index=index, + name=self.name, + copy=False, + dtype=StringDtype(storage="python", na_value=np.nan), + ) + else: + raise return result def write(self, obj, **kwargs) -> None: @@ -4540,7 +4590,7 @@ def write_data(self, chunksize: int | None, dropna: bool = False) -> None: masks.append(mask.astype("u1", copy=False)) # consolidate masks - if len(masks): + if masks: mask = masks[0] for m in masks[1:]: mask = mask & m @@ -4660,7 +4710,7 @@ def delete( groups = list(diff[diff > 1].index) # 1 group - if not len(groups): + if not groups: groups = [0] # final element @@ -4764,7 +4814,24 @@ def read( values = values.reshape((1, values.shape[0])) if isinstance(values, (np.ndarray, DatetimeArray)): - df = DataFrame(values.T, columns=cols_, index=index_, copy=False) + try: + df = DataFrame(values.T, columns=cols_, index=index_, copy=False) + except UnicodeEncodeError as err: + if ( + self.errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + df = DataFrame( + values.T, + columns=cols_, + index=index_, + copy=False, + dtype=StringDtype(storage="python", na_value=np.nan), + ) + else: + raise elif isinstance(values, Index): df = DataFrame(values, columns=cols_, index=index_) else: @@ -4774,23 +4841,10 @@ def read( assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) # If str / string dtype is stored in meta, use that. - converted = False for column in cols_: dtype = getattr(self.table.attrs, f"{column}_meta", None) if dtype in ["str", "string"]: df[column] = df[column].astype(dtype) - converted = True - # Otherwise try inference. 
- if ( - not converted - and using_string_dtype() - and isinstance(values, np.ndarray) - and is_string_array( - values, - skipna=True, - ) - ): - df = df.astype(StringDtype(na_value=np.nan)) frames.append(df) if len(frames) == 1: @@ -5224,7 +5278,7 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd # encode if needed if len(data): data = ( - Series(data.ravel(), copy=False) + Series(data.ravel(), copy=False, dtype="object") .str.encode(encoding, errors) ._values.reshape(data.shape) ) @@ -5264,7 +5318,9 @@ def _unconvert_string_array( dtype = f"U{itemsize}" if isinstance(data[0], bytes): - ser = Series(data, copy=False).str.decode(encoding, errors=errors) + ser = Series(data, copy=False).str.decode( + encoding, errors=errors, dtype="object" + ) data = ser.to_numpy() data.flags.writeable = True else: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 0e0f07c0f8ff3..7376843f7e8ff 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1901,7 +1901,7 @@ def prep_table( # Type[str], Type[float], Type[int], Type[complex], Type[bool], # Type[object]]]]"; expected type "Union[ExtensionDtype, str, # dtype[Any], Type[object]]" - dtype = {col_name: dtype for col_name in frame} # type: ignore[misc] + dtype = dict.fromkeys(frame, dtype) # type: ignore[arg-type] else: dtype = cast(dict, dtype) @@ -2615,7 +2615,7 @@ def _create_table_setup(self): ] ix_cols = [cname for cname, _, is_index in column_names_and_types if is_index] - if len(ix_cols): + if ix_cols: cnames = "_".join(ix_cols) cnames_br = ",".join([escape(c) for c in ix_cols]) create_stmts.append( @@ -2859,7 +2859,7 @@ def to_sql( # Type[str], Type[float], Type[int], Type[complex], Type[bool], # Type[object]]]]"; expected type "Union[ExtensionDtype, str, # dtype[Any], Type[object]]" - dtype = {col_name: dtype for col_name in frame} # type: ignore[misc] + dtype = dict.fromkeys(frame, dtype) # type: ignore[arg-type] else: dtype = cast(dict, dtype) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 34d95fb59a21c..cd290710ddbaa 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -3196,8 +3196,8 @@ def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]: for o, (idx, row) in enumerate(selected.iterrows()): for j, (col, v) in enumerate(col_index): val = row[col] - # Allow columns with mixed str and None (GH 23633) - val = "" if val is None else val + # Allow columns with mixed str and None or pd.NA (GH 23633) + val = "" if isna(val) else val key = gso_table.get(val, None) if key is None: # Stata prefers human numbers diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 4c00049075d03..774062e0f0412 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -225,16 +225,20 @@ def __call__(self, x, pos: int | None = 0) -> str: class PeriodConverter(mdates.DateConverter): @staticmethod def convert(values, units, axis): + if not hasattr(axis, "freq"): + raise TypeError("Axis must have `freq` set to convert to Periods") + return PeriodConverter.convert_from_freq(values, axis.freq) + + @staticmethod + def convert_from_freq(values, freq): if is_nested_list_like(values): - values = [PeriodConverter._convert_1d(v, units, axis) for v in values] + values = [PeriodConverter._convert_1d(v, freq) for v in values] else: - values = PeriodConverter._convert_1d(values, units, axis) + values = PeriodConverter._convert_1d(values, freq) return values @staticmethod - def _convert_1d(values, units, axis): - if 
not hasattr(axis, "freq"): - raise TypeError("Axis must have `freq` set to convert to Periods") + def _convert_1d(values, freq): valid_types = (str, datetime, Period, pydt.date, pydt.time, np.datetime64) with warnings.catch_warnings(): warnings.filterwarnings( @@ -248,17 +252,17 @@ def _convert_1d(values, units, axis): or is_integer(values) or is_float(values) ): - return get_datevalue(values, axis.freq) + return get_datevalue(values, freq) elif isinstance(values, PeriodIndex): - return values.asfreq(axis.freq).asi8 + return values.asfreq(freq).asi8 elif isinstance(values, Index): - return values.map(lambda x: get_datevalue(x, axis.freq)) + return values.map(lambda x: get_datevalue(x, freq)) elif lib.infer_dtype(values, skipna=False) == "period": # https://github.com/pandas-dev/pandas/issues/24304 # convert ndarray[period] -> PeriodIndex - return PeriodIndex(values, freq=axis.freq).asi8 + return PeriodIndex(values, freq=freq).asi8 elif isinstance(values, (list, tuple, np.ndarray, Index)): - return [get_datevalue(x, axis.freq) for x in values] + return [get_datevalue(x, freq) for x in values] return values diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 1035150302d2c..1c7e1ab57b2a9 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -55,19 +55,20 @@ from pandas.core.dtypes.missing import isna import pandas.core.common as com -from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib import tools -from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters +from pandas.plotting._matplotlib.converter import ( + PeriodConverter, + register_pandas_matplotlib_converters, +) from pandas.plotting._matplotlib.groupby import reconstruct_data_with_by from pandas.plotting._matplotlib.misc import unpack_single_str_list from pandas.plotting._matplotlib.style import get_standard_colors from pandas.plotting._matplotlib.timeseries import ( - decorate_axes, format_dateaxis, maybe_convert_index, - maybe_resample, + prepare_ts_data, use_dynamic_x, ) from pandas.plotting._matplotlib.tools import ( @@ -288,6 +289,21 @@ def __init__( self.data = self._ensure_frame(self.data) + from pandas.plotting import plot_params + + self.x_compat = plot_params["x_compat"] + if "x_compat" in self.kwds: + self.x_compat = bool(self.kwds.pop("x_compat")) + + @final + def _is_ts_plot(self) -> bool: + # this is slightly deceptive + return not self.x_compat and self.use_index and self._use_dynamic_x() + + @final + def _use_dynamic_x(self) -> bool: + return use_dynamic_x(self._get_ax(0), self.data.index) + @final @staticmethod def _validate_sharex(sharex: bool | None, ax, by) -> bool: @@ -786,7 +802,13 @@ def _adorn_subplots(self, fig: Figure) -> None: if self.title: if self.subplots: if is_list_like(self.title): - if len(self.title) != self.nseries: + if not isinstance(self.subplots, bool): + if len(self.subplots) != len(self.title): + raise ValueError( + f"The number of titles ({len(self.title)}) must equal " + f"the number of subplots ({len(self.subplots)})." 
+ ) + elif len(self.title) != self.nseries: raise ValueError( "The length of `title` must equal the number " "of columns if using `title` of type `list` " @@ -872,10 +894,7 @@ def _make_legend(self) -> None: if leg is not None: title = leg.get_title().get_text() # Replace leg.legend_handles because it misses marker info - if Version(mpl.__version__) < Version("3.7"): - handles = leg.legendHandles - else: - handles = leg.legend_handles + handles = leg.legend_handles labels = [x.get_text() for x in leg.get_texts()] if self.legend: @@ -1213,15 +1232,10 @@ def _get_errorbars( @final def _get_subplots(self, fig: Figure) -> list[Axes]: - if Version(mpl.__version__) < Version("3.8"): - Klass = mpl.axes.Subplot - else: - Klass = mpl.axes.Axes - return [ ax for ax in fig.get_axes() - if (isinstance(ax, Klass) and ax.get_subplotspec() is not None) + if (isinstance(ax, mpl.axes.Axes) and ax.get_subplotspec() is not None) ] @final @@ -1324,10 +1338,20 @@ def __init__( c = self.data.columns[c] self.c = c + @register_pandas_matplotlib_converters def _make_plot(self, fig: Figure) -> None: x, y, c, data = self.x, self.y, self.c, self.data ax = self.axes[0] + from pandas import Series + + x_data = data[x] + s = Series(index=x_data) + if use_dynamic_x(ax, s.index): + s = maybe_convert_index(ax, s) + freq, s = prepare_ts_data(s, ax, self.kwds) + x_data = s.index + c_is_column = is_hashable(c) and c in self.data.columns color_by_categorical = c_is_column and isinstance( @@ -1344,7 +1368,7 @@ def _make_plot(self, fig: Figure) -> None: else: label = None - # if a list of non color strings is passed in as c, color points + # if a list of non-color strings is passed in as c, color points # by uniqueness of the strings, such same strings get same color create_colors = not self._are_valid_colors(c_values) if create_colors: @@ -1360,7 +1384,7 @@ def _make_plot(self, fig: Figure) -> None: ) scatter = ax.scatter( - data[x].values, + x_data.values, data[y].values, c=c_values, label=label, @@ -1520,23 +1544,9 @@ def _kind(self) -> Literal["line", "area", "hist", "kde", "box"]: return "line" def __init__(self, data, **kwargs) -> None: - from pandas.plotting import plot_params - MPLPlot.__init__(self, data, **kwargs) if self.stacked: self.data = self.data.fillna(value=0) - self.x_compat = plot_params["x_compat"] - if "x_compat" in self.kwds: - self.x_compat = bool(self.kwds.pop("x_compat")) - - @final - def _is_ts_plot(self) -> bool: - # this is slightly deceptive - return not self.x_compat and self.use_index and self._use_dynamic_x() - - @final - def _use_dynamic_x(self) -> bool: - return use_dynamic_x(self._get_ax(0), self.data) def _make_plot(self, fig: Figure) -> None: if self._is_ts_plot(): @@ -1626,15 +1636,8 @@ def _ts_plot(self, ax: Axes, x, data: Series, style=None, **kwds): # accept x to be consistent with normal plot func, # x is not passed to tsplot as it uses data.index as x coordinate # column_num must be in kwds for stacking purpose - freq, data = maybe_resample(data, ax, kwds) + freq, data = prepare_ts_data(data, ax, kwds) - # Set ax with freq info - decorate_axes(ax, freq) - # digging deeper - if hasattr(ax, "left_ax"): - decorate_axes(ax.left_ax, freq) - if hasattr(ax, "right_ax"): - decorate_axes(ax.right_ax, freq) # TODO #54485 ax._plot_data.append((data, self._kind, kwds)) # type: ignore[attr-defined] @@ -1855,7 +1858,6 @@ def __init__( self.bar_width = width self._align = align self._position = position - self.tick_pos = np.arange(len(data)) if is_list_like(bottom): bottom = np.array(bottom) @@ 
-1868,6 +1870,16 @@ def __init__( MPLPlot.__init__(self, data, **kwargs) + if self._is_ts_plot(): + self.tick_pos = np.array( + PeriodConverter.convert_from_freq( + self._get_xticks(), + data.index.freq, + ) + ) + else: + self.tick_pos = np.arange(len(data)) + @cache_readonly def ax_pos(self) -> np.ndarray: return self.tick_pos - self.tickoffset @@ -1897,6 +1909,7 @@ def lim_offset(self): # error: Signature of "_plot" incompatible with supertype "MPLPlot" @classmethod + @register_pandas_matplotlib_converters def _plot( # type: ignore[override] cls, ax: Axes, @@ -1921,6 +1934,21 @@ def _make_plot(self, fig: Figure) -> None: K = self.nseries data = self.data.fillna(0) + + _stacked_subplots_ind: dict[int, int] = {} + _stacked_subplots_offsets = [] + + self.subplots: list[Any] + + if not isinstance(self.subplots, bool): + if bool(self.subplots) and self.stacked: + for i, sub_plot in enumerate(self.subplots): + if len(sub_plot) <= 1: + continue + for plot in sub_plot: + _stacked_subplots_ind[int(plot)] = i + _stacked_subplots_offsets.append([0, 0]) + for i, (label, y) in enumerate(self._iter_data(data=data)): ax = self._get_ax(i) kwds = self.kwds.copy() @@ -1946,7 +1974,28 @@ def _make_plot(self, fig: Figure) -> None: start = start + self._start_base kwds["align"] = self._align - if self.subplots: + + if i in _stacked_subplots_ind: + offset_index = _stacked_subplots_ind[i] + pos_prior, neg_prior = _stacked_subplots_offsets[offset_index] # type:ignore[assignment] + mask = y >= 0 + start = np.where(mask, pos_prior, neg_prior) + self._start_base + w = self.bar_width / 2 + rect = self._plot( + ax, + self.ax_pos + w, + y, + self.bar_width, + start=start, + label=label, + log=self.log, + **kwds, + ) + pos_new = pos_prior + np.where(mask, y, 0) + neg_new = neg_prior + np.where(mask, 0, y) + _stacked_subplots_offsets[offset_index] = [pos_new, neg_new] + + elif self.subplots: w = self.bar_width / 2 rect = self._plot( ax, diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index 962f9711d9916..7cf63c8621392 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -22,8 +22,6 @@ from pandas.core.dtypes.common import is_list_like -import pandas.core.common as com - if TYPE_CHECKING: from matplotlib.colors import Colormap @@ -251,31 +249,17 @@ def _is_floats_color(color: Color | Collection[Color]) -> bool: def _get_colors_from_color_type(color_type: str, num_colors: int) -> list[Color]: """Get colors from user input color type.""" if color_type == "default": - return _get_default_colors(num_colors) + prop_cycle = mpl.rcParams["axes.prop_cycle"] + return [ + c["color"] + for c in itertools.islice(prop_cycle, min(num_colors, len(prop_cycle))) + ] elif color_type == "random": - return _get_random_colors(num_colors) + return np.random.default_rng(num_colors).random((num_colors, 3)).tolist() else: raise ValueError("color_type must be either 'default' or 'random'") -def _get_default_colors(num_colors: int) -> list[Color]: - """Get `num_colors` of default colors from matplotlib rc params.""" - colors = [c["color"] for c in mpl.rcParams["axes.prop_cycle"]] - return colors[0:num_colors] - - -def _get_random_colors(num_colors: int) -> list[Color]: - """Get `num_colors` of random colors.""" - return [_random_color(num) for num in range(num_colors)] - - -def _random_color(column: int) -> list[float]: - """Get a random color represented as a list of length 3""" - # GH17525 use common._random_state to avoid resetting the seed - rs = 
com.random_state(column) - return rs.rand(3).tolist() - - def _is_single_string_color(color: Color) -> bool: """Check if `color` is a single string color. diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index d95ccad2da565..beaf5b6259ef3 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -48,7 +48,6 @@ from pandas._typing import NDFrameT from pandas import ( - DataFrame, DatetimeIndex, Index, PeriodIndex, @@ -231,8 +230,8 @@ def _get_freq(ax: Axes, series: Series): return freq, ax_freq -def use_dynamic_x(ax: Axes, data: DataFrame | Series) -> bool: - freq = _get_index_freq(data.index) +def use_dynamic_x(ax: Axes, index: Index) -> bool: + freq = _get_index_freq(index) ax_freq = _get_ax_freq(ax) if freq is None: # convert irregular if axes has freq info @@ -250,16 +249,15 @@ def use_dynamic_x(ax: Axes, data: DataFrame | Series) -> bool: return False # FIXME: hack this for 0.10.1, creating more technical debt...sigh - if isinstance(data.index, ABCDatetimeIndex): + if isinstance(index, ABCDatetimeIndex): # error: "BaseOffset" has no attribute "_period_dtype_code" freq_str = OFFSET_TO_PERIOD_FREQSTR.get(freq_str, freq_str) base = to_offset(freq_str, is_period=True)._period_dtype_code # type: ignore[attr-defined] - x = data.index if base <= FreqGroup.FR_DAY.value: - return x[:1].is_normalized - period = Period(x[0], freq_str) + return index[:1].is_normalized + period = Period(index[0], freq_str) assert isinstance(period, Period) - return period.to_timestamp().tz_localize(x.tz) == x[0] + return period.to_timestamp().tz_localize(index.tz) == index[0] return True @@ -366,3 +364,19 @@ def format_dateaxis( raise TypeError("index type not supported") plt.draw_if_interactive() + + +def prepare_ts_data( + series: Series, ax: Axes, kwargs: dict[str, Any] +) -> tuple[BaseOffset | str, Series]: + freq, data = maybe_resample(series, ax, kwargs) + + # Set ax with freq info + decorate_axes(ax, freq) + # digging deeper + if hasattr(ax, "left_ax"): + decorate_axes(ax.left_ax, freq) + if hasattr(ax, "right_ax"): + decorate_axes(ax.right_ax, freq) + + return freq, data diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 0e0fb23d924bc..0f2d824f37ffc 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -68,7 +68,7 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) >>> fig, ax = plt.subplots() >>> ax.axis("off") - (0.0, 1.0, 0.0, 1.0) + (np.float64(0.0), np.float64(1.0), np.float64(0.0), np.float64(1.0)) >>> table = pd.plotting.table( ... ax, df, loc="center", cellLoc="center", colWidths=[0.2, 0.2] ... ) @@ -412,7 +412,7 @@ def andrews_curves( >>> df = pd.read_csv( ... "https://raw.githubusercontent.com/pandas-dev/" ... "pandas/main/pandas/tests/io/data/csv/iris.csv" - ... ) + ... ) # doctest: +SKIP >>> pd.plotting.andrews_curves(df, "Name") # doctest: +SKIP """ plot_backend = _get_plot_backend("matplotlib") @@ -551,7 +551,7 @@ def parallel_coordinates( >>> df = pd.read_csv( ... "https://raw.githubusercontent.com/pandas-dev/" ... "pandas/main/pandas/tests/io/data/csv/iris.csv" - ... ) + ... ) # doctest: +SKIP >>> pd.plotting.parallel_coordinates( ... df, "Name", color=("#556270", "#4ECDC4", "#C7F464") ... ) # doctest: +SKIP @@ -633,6 +633,15 @@ def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwargs) -> Ax """ Autocorrelation plot for time series. 
+ This method generates an autocorrelation plot for a given time series, + which helps to identify any periodic structure or correlation within the + data across various lags. It shows the correlation of a time series with a + delayed copy of itself as a function of delay. Autocorrelation plots are useful for + checking randomness in a data set. If the data are random, the autocorrelations + should be near zero for any and all time-lag separations. If the data are not + random, then one or more of the autocorrelations will be significantly + non-zero. + Parameters ---------- series : Series diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 2ba90948be399..871e977cbe2f8 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -168,6 +168,7 @@ class TestPDApi(Base): "read_parquet", "read_orc", "read_spss", + "read_iceberg", ] # top-level json funcs diff --git a/pandas/tests/apply/conftest.py b/pandas/tests/apply/conftest.py new file mode 100644 index 0000000000000..aecf82f5a9419 --- /dev/null +++ b/pandas/tests/apply/conftest.py @@ -0,0 +1,63 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, +) +from pandas.api.executors import BaseExecutionEngine + + +class MockExecutionEngine(BaseExecutionEngine): + """ + Execution Engine to test if the execution engine interface receives and + uses all parameters provided by the user. + + Making this engine work as the default Python engine by calling it, no extra + functionality is implemented here. + + When testing, this will be called when this engine is provided, and then the + same pandas.map and pandas.apply function will be called, but without engine, + executing the default behavior from the python engine. + """ + + def map(data, func, args, kwargs, decorator, skip_na): + kwargs_to_pass = kwargs if isinstance(data, DataFrame) else {} + return data.map(func, na_action="ignore" if skip_na else None, **kwargs_to_pass) + + def apply(data, func, args, kwargs, decorator, axis): + if isinstance(data, Series): + return data.apply(func, convert_dtype=True, args=args, by_row=False) + elif isinstance(data, DataFrame): + return data.apply( + func, + axis=axis, + raw=False, + result_type=None, + args=args, + by_row="compat", + **kwargs, + ) + else: + assert isinstance(data, np.ndarray) + + def wrap_function(func): + # https://github.com/numpy/numpy/issues/8352 + def wrapper(*args, **kwargs): + result = func(*args, **kwargs) + if isinstance(result, str): + result = np.array(result, dtype=object) + return result + + return wrapper + + return np.apply_along_axis(wrap_function(func), axis, data, *args, **kwargs) + + +class MockEngineDecorator: + __pandas_udf__ = MockExecutionEngine + + +@pytest.fixture(params=[None, MockEngineDecorator]) +def engine(request): + return request.param diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 2d47cd851ad10..a9afb5dbd11d7 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -17,63 +17,11 @@ date_range, ) import pandas._testing as tm -from pandas.api.executors import BaseExecutionEngine +from pandas.tests.apply.conftest import MockEngineDecorator from pandas.tests.frame.common import zip_frames from pandas.util.version import Version -class MockExecutionEngine(BaseExecutionEngine): - """ - Execution Engine to test if the execution engine interface receives and - uses all parameters provided by the user. 
- - Making this engine work as the default Python engine by calling it, no extra - functionality is implemented here. - - When testing, this will be called when this engine is provided, and then the - same pandas.map and pandas.apply function will be called, but without engine, - executing the default behavior from the python engine. - """ - - def map(data, func, args, kwargs, decorator, skip_na): - kwargs_to_pass = kwargs if isinstance(data, DataFrame) else {} - return data.map( - func, na_action="ignore" if skip_na else None, **kwargs_to_pass - ) - - def apply(data, func, args, kwargs, decorator, axis): - if isinstance(data, Series): - return data.apply(func, convert_dtype=True, args=args, by_row=False) - elif isinstance(data, DataFrame): - return data.apply( - func, - axis=axis, - raw=False, - result_type=None, - args=args, - by_row="compat", - **kwargs, - ) - else: - assert isinstance(data, np.ndarray) - - def wrap_function(func): - # https://github.com/numpy/numpy/issues/8352 - def wrapper(*args, **kwargs): - result = func(*args, **kwargs) - if isinstance(result, str): - result = np.array(result, dtype=object) - return result - - return wrapper - - return np.apply_along_axis(wrap_function(func), axis, data, *args, **kwargs) - - -class MockEngineDecorator: - __pandas_udf__ = MockExecutionEngine - - @pytest.fixture def int_frame_const_col(): """ @@ -334,7 +282,7 @@ def test_apply_broadcast_scalars(float_frame): def test_apply_broadcast_scalars_axis1(float_frame): result = float_frame.apply(np.mean, axis=1, result_type="broadcast") m = float_frame.mean(axis=1) - expected = DataFrame({c: m for c in float_frame.columns}) + expected = DataFrame(dict.fromkeys(float_frame.columns, m)) tm.assert_frame_equal(result, expected) @@ -361,7 +309,7 @@ def test_apply_broadcast_lists_index(float_frame): ) m = list(range(len(float_frame.index))) expected = DataFrame( - {c: m for c in float_frame.columns}, + dict.fromkeys(float_frame.columns, m), dtype="float64", index=float_frame.index, ) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 9541b0b7495c7..896c5c5fca9f7 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -376,13 +376,13 @@ def test_demo(): @pytest.mark.parametrize("func", [str, lambda x: str(x)]) -def test_apply_map_evaluate_lambdas_the_same(string_series, func, by_row): +def test_apply_map_evaluate_lambdas_the_same(string_series, func, by_row, engine): # test that we are evaluating row-by-row first if by_row="compat" # else vectorized evaluation result = string_series.apply(func, by_row=by_row) if by_row: - expected = string_series.map(func) + expected = string_series.map(func, engine=engine) tm.assert_series_equal(result, expected) else: assert result == str(string_series) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index dee3deeee0f2f..9fbea2022c87b 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -178,25 +178,9 @@ def test_error_invalid_values(data, all_arithmetic_operators): ops = getattr(s, op) # invalid scalars - msg = "|".join( - [ - r"can only perform ops with numeric values", - r"IntegerArray cannot perform the operation mod", - r"unsupported operand type", - r"can only concatenate str \(not \"int\"\) to str", - "not all arguments converted during string", - "ufunc '.*' not supported for the input types, and the inputs could not", - "ufunc '.*' did 
not contain a loop with signature matching types", "Addition/subtraction of integers and integer-arrays with Timestamp", "has no kernel", "not implemented", "The 'out' kwarg is necessary. Use numpy.strings.multiply without it.", "not supported for dtype", ] ) - with pytest.raises(TypeError, match=msg): + with tm.external_error_raised(TypeError): ops("foo") - with pytest.raises(TypeError, match=msg): + with tm.external_error_raised(TypeError): ops(pd.Timestamp("20180101")) # invalid array-likes @@ -214,25 +198,10 @@ def test_error_invalid_values(data, all_arithmetic_operators): # more-correct than np.nan here. tm.assert_series_equal(res, expected) else: - with pytest.raises(TypeError, match=msg): + with tm.external_error_raised(TypeError): ops(str_ser) - msg = "|".join( - [ - "can only perform ops with numeric values", - "cannot perform .* with this index type: DatetimeArray", - "Addition/subtraction of integers and integer-arrays " - "with DatetimeArray is no longer supported. *", - "unsupported operand type", - r"can only concatenate str \(not \"int\"\) to str", - "not all arguments converted during string", - "cannot subtract DatetimeArray from ndarray", - "has no kernel", - "not implemented", - "not supported for dtype", - ] - ) - with pytest.raises(TypeError, match=msg): + with tm.external_error_raised(TypeError): ops(pd.Series(pd.date_range("20180101", periods=len(s)))) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 336a0fef69170..736c0e1782fc0 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -10,10 +10,12 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import ( pa_version_under12p0, pa_version_under19p0, ) +import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_dtype_equal @@ -45,6 +47,25 @@ def cls(dtype): return dtype.construct_array_type() +def string_dtype_highest_priority(dtype1, dtype2): + if HAS_PYARROW: + DTYPE_HIERARCHY = [ + pd.StringDtype("python", na_value=np.nan), + pd.StringDtype("pyarrow", na_value=np.nan), + pd.StringDtype("python", na_value=pd.NA), + pd.StringDtype("pyarrow", na_value=pd.NA), + ] + else: + DTYPE_HIERARCHY = [ + pd.StringDtype("python", na_value=np.nan), + pd.StringDtype("python", na_value=pd.NA), + ] + + h1 = DTYPE_HIERARCHY.index(dtype1) + h2 = DTYPE_HIERARCHY.index(dtype2) + return DTYPE_HIERARCHY[max(h1, h2)] + + def test_dtype_constructor(): pytest.importorskip("pyarrow") @@ -103,6 +124,18 @@ def test_repr(dtype): assert repr(df.A.array) == expected +def test_dtype_repr(dtype): + if dtype.storage == "pyarrow": + if dtype.na_value is pd.NA: + assert repr(dtype) == "<StringDtype(storage='pyarrow', na_value=<NA>)>" + else: + assert repr(dtype) == "<StringDtype(storage='pyarrow', na_value=nan)>" + elif dtype.na_value is pd.NA: + assert repr(dtype) == "<StringDtype(storage='python', na_value=<NA>)>" + else: + assert repr(dtype) == "<StringDtype(storage='python', na_value=nan)>" + + def test_none_to_nan(cls, dtype): a = cls._from_sequence(["a", None, "b"], dtype=dtype) assert a[1] is not None @@ -319,13 +352,18 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): tm.assert_extension_array_equal(result, expected) -def test_comparison_methods_array(comparison_op, dtype): +def test_comparison_methods_array(comparison_op, dtype, dtype2): op_name = f"__{comparison_op.__name__}__" a = pd.array(["a", None, "c"], dtype=dtype) - other = [None, None, "c"] - result = getattr(a, op_name)(other) - if dtype.na_value is np.nan: + other = pd.array([None, None, "c"], dtype=dtype2) + result = 
comparison_op(a, other) + + # ensure operation is commutative + result2 = comparison_op(other, a) + tm.assert_equal(result, result2) + + if dtype.na_value is np.nan and dtype2.na_value is np.nan: if operator.ne == comparison_op: expected = np.array([True, True, False]) else: @@ -333,11 +371,56 @@ def test_comparison_methods_array(comparison_op, dtype): expected[-1] = getattr(other[-1], op_name)(a[-1]) tm.assert_numpy_array_equal(result, expected) - result = getattr(a, op_name)(pd.NA) + else: + max_dtype = string_dtype_highest_priority(dtype, dtype2) + if max_dtype.storage == "python": + expected_dtype = "boolean" + else: + expected_dtype = "bool[pyarrow]" + + expected = np.full(len(a), fill_value=None, dtype="object") + expected[-1] = getattr(other[-1], op_name)(a[-1]) + expected = pd.array(expected, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_comparison_methods_array_arrow_extension(comparison_op, dtype2): + # Test pd.ArrowDtype(pa.string()) against other string arrays + import pyarrow as pa + + op_name = f"__{comparison_op.__name__}__" + dtype = pd.ArrowDtype(pa.string()) + a = pd.array(["a", None, "c"], dtype=dtype) + other = pd.array([None, None, "c"], dtype=dtype2) + result = comparison_op(a, other) + + # ensure operation is commutative + result2 = comparison_op(other, a) + tm.assert_equal(result, result2) + + expected = pd.array([None, None, True], dtype="bool[pyarrow]") + expected[-1] = getattr(other[-1], op_name)(a[-1]) + tm.assert_extension_array_equal(result, expected) + + +def test_comparison_methods_list(comparison_op, dtype): + op_name = f"__{comparison_op.__name__}__" + + a = pd.array(["a", None, "c"], dtype=dtype) + other = [None, None, "c"] + result = comparison_op(a, other) + + # ensure operation is commutative + result2 = comparison_op(other, a) + tm.assert_equal(result, result2) + + if dtype.na_value is np.nan: if operator.ne == comparison_op: - expected = np.array([True, True, True]) + expected = np.array([True, True, False]) else: expected = np.array([False, False, False]) + expected[-1] = getattr(other[-1], op_name)(a[-1]) tm.assert_numpy_array_equal(result, expected) else: @@ -347,10 +430,6 @@ def test_comparison_methods_array(comparison_op, dtype): expected = pd.array(expected, dtype=expected_dtype) tm.assert_extension_array_equal(result, expected) - result = getattr(a, op_name)(pd.NA) - expected = pd.array([None, None, None], dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) - def test_constructor_raises(cls): if cls is pd.arrays.StringArray: diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 3c0bf6c35866c..9fed65faee896 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -2006,3 +2006,24 @@ def test_eval_float_div_numexpr(): result = pd.eval("1 / 2", engine="numexpr") expected = 0.5 assert result == expected + + +def test_method_calls_on_binop(): + # GH 61175 + x = Series([1, 2, 3, 5]) + y = Series([2, 3, 4]) + + # Method call on binary operation result + result = pd.eval("(x + y).dropna()") + expected = (x + y).dropna() + tm.assert_series_equal(result, expected) + + # Test with other binary operations + result = pd.eval("(x * y).dropna()") + expected = (x * y).dropna() + tm.assert_series_equal(result, expected) + + # Test with method chaining + result = pd.eval("(x + y).dropna().reset_index(drop=True)") + expected = (x + y).dropna().reset_index(drop=True) + 
tm.assert_series_equal(result, expected) diff --git a/pandas/tests/config/test_config.py b/pandas/tests/config/test_config.py index aaf6178866ecd..a6bc40469cada 100644 --- a/pandas/tests/config/test_config.py +++ b/pandas/tests/config/test_config.py @@ -195,6 +195,24 @@ def test_set_option_multiple(self): assert cf.get_option("b.c") is None assert cf.get_option("b.b") == 10.0 + def test_set_option_dict(self): + # GH 61093 + + cf.register_option("a", 1, "doc") + cf.register_option("b.c", "hullo", "doc2") + cf.register_option("b.b", None, "doc2") + + assert cf.get_option("a") == 1 + assert cf.get_option("b.c") == "hullo" + assert cf.get_option("b.b") is None + + options_dict = {"a": "2", "b.c": None, "b.b": 10.0} + cf.set_option(options_dict) + + assert cf.get_option("a") == "2" + assert cf.get_option("b.c") is None + assert cf.get_option("b.b") == 10.0 + def test_validation(self): cf.register_option("a", 1, "doc", validator=cf.is_int) cf.register_option("d", 1, "doc", validator=cf.is_nonnegative_int) @@ -377,6 +395,33 @@ def f(): f() + def test_set_ContextManager_dict(self): + def eq(val): + assert cf.get_option("a") == val + assert cf.get_option("b.c") == val + + cf.register_option("a", 0) + cf.register_option("b.c", 0) + + eq(0) + with cf.option_context({"a": 15, "b.c": 15}): + eq(15) + with cf.option_context({"a": 25, "b.c": 25}): + eq(25) + eq(15) + eq(0) + + cf.set_option("a", 17) + cf.set_option("b.c", 17) + eq(17) + + # Test that option_context can be used as a decorator too + @cf.option_context({"a": 123, "b.c": 123}) + def f(): + eq(123) + + f() + def test_attribute_access(self): holder = [] diff --git a/pandas/tests/dtypes/cast/test_maybe_box_native.py b/pandas/tests/dtypes/cast/test_maybe_box_native.py index 3f62f31dac219..151586962d517 100644 --- a/pandas/tests/dtypes/cast/test_maybe_box_native.py +++ b/pandas/tests/dtypes/cast/test_maybe_box_native.py @@ -17,7 +17,7 @@ "obj,expected_dtype", [ (b"\x00\x10", bytes), - (int(4), int), + ((4), int), (np.uint(4), int), (np.int32(-4), int), (np.uint8(4), int), diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index db98751324ebc..7fd0395009adb 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -250,6 +250,15 @@ class MyDataFrame(DataFrame, Generic[T]): ... 
assert inference.is_list_like(tst) +def test_is_list_like_native_container_types(): + # GH 61565 + # is_list_like was yielding false positives for native container types + assert not inference.is_list_like(list[int]) + assert not inference.is_list_like(list[str]) + assert not inference.is_list_like(tuple[int]) + assert not inference.is_list_like(tuple[str]) + + def test_is_sequence(): is_seq = inference.is_sequence assert is_seq((1, 2)) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index c61cda83cf6e0..a5b22ac30d820 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -769,8 +769,8 @@ def test_empty_like(self): np.datetime64("NaT"), np.timedelta64("NaT"), ] - + [np.datetime64("NaT", unit) for unit in m8_units] - + [np.timedelta64("NaT", unit) for unit in m8_units] + + [np.datetime64("NaT", unit) for unit in m8_units] # type: ignore[call-overload] + + [np.timedelta64("NaT", unit) for unit in m8_units] # type: ignore[call-overload] ) inf_vals = [ diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 2915c0585f373..a760cbc3995b3 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.core.dtypes.dtypes import NumpyEADtype + import pandas as pd import pandas._testing as tm from pandas.api.extensions import ExtensionArray @@ -266,7 +268,13 @@ def test_stack(self, data, columns, future_stack): expected = expected.astype(object) if isinstance(expected, pd.Series): - assert result.dtype == df.iloc[:, 0].dtype + if future_stack and isinstance(data.dtype, NumpyEADtype): + # GH#58817 future_stack=True constructs the result specifying the dtype + # using the dtype of the input; we thus get the underlying + # NumPy dtype as the result instead of the NumpyExtensionArray + assert result.dtype == df.iloc[:, 0].to_numpy().dtype + else: + assert result.dtype == df.iloc[:, 0].dtype else: assert all(result.dtypes == df.iloc[:, 0].dtype) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 1d613ced2c03f..185d6d750cace 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -440,11 +440,11 @@ def test_delitem_series(self, data): tm.assert_series_equal(ser, expected) def test_setitem_invalid(self, data, invalid_scalar): - msg = "" # messages vary by subclass, so we do not test it - with pytest.raises((ValueError, TypeError), match=msg): + # messages vary by subclass, so we do not test it + with pytest.raises((ValueError, TypeError), match=None): data[0] = invalid_scalar - with pytest.raises((ValueError, TypeError), match=msg): + with pytest.raises((ValueError, TypeError), match=None): data[:] = invalid_scalar def test_setitem_2d_values(self, data): diff --git a/pandas/tests/extension/date/array.py b/pandas/tests/extension/date/array.py index 2306f5974ba18..0c51570189a7c 100644 --- a/pandas/tests/extension/date/array.py +++ b/pandas/tests/extension/date/array.py @@ -113,7 +113,7 @@ def __init__( # error: "object_" object is not iterable obj = np.char.split(dates, sep="-") - for (i,), (y, m, d) in np.ndenumerate(obj): # type: ignore[misc] + for (i,), (y, m, d) in np.ndenumerate(obj): self._year[i] = int(y) self._month[i] = int(m) self._day[i] = int(d) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 25129111180d6..96c014f549056 100644 --- 
a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -31,6 +31,7 @@ from pandas.api.types import is_string_dtype from pandas.core.arrays import ArrowStringArray from pandas.core.arrays.string_ import StringDtype +from pandas.tests.arrays.string_.test_string import string_dtype_highest_priority from pandas.tests.extension import base @@ -202,10 +203,13 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: cast_to = dtype + dtype_other = tm.get_dtype(other) if not isinstance(other, str) else None + if isinstance(dtype_other, StringDtype): + cast_to = string_dtype_highest_priority(dtype, dtype_other) elif dtype.na_value is np.nan: cast_to = np.bool_ # type: ignore[assignment] elif dtype.storage == "pyarrow": - cast_to = "boolean[pyarrow]" # type: ignore[assignment] + cast_to = "bool[pyarrow]" # type: ignore[assignment] else: cast_to = "boolean" # type: ignore[assignment] return pointwise_result.astype(cast_to) @@ -236,10 +240,10 @@ def test_arith_series_with_array( if ( using_infer_string and all_arithmetic_operators == "__radd__" - and ( - (dtype.na_value is pd.NA) or (dtype.storage == "python" and HAS_PYARROW) - ) + and dtype.na_value is pd.NA + and (HAS_PYARROW or dtype.storage == "pyarrow") ): + # TODO(infer_string) mark = pytest.mark.xfail( reason="The pointwise operation result will be inferred to " "string[nan, pyarrow], which does not match the input dtype" diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 6dfbc325aafa4..304638a3a7dcf 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -497,3 +497,13 @@ def test_corr_within_bounds(self): corr_matrix = df2.corr() assert corr_matrix.min().min() >= -1.0 assert corr_matrix.max().max() <= 1.0 + + def test_cov_with_missing_values(self): + df = DataFrame({"A": [1, 2, None, 4], "B": [2, 4, None, 9]}) + expected = DataFrame( + {"A": [2.333333, 5.500000], "B": [5.5, 13.0]}, index=["A", "B"] + ) + result1 = df.cov() + result2 = df.dropna().cov() + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) diff --git a/pandas/tests/frame/methods/test_dot.py b/pandas/tests/frame/methods/test_dot.py index 3e01f67c8794b..b365ceb2ab61c 100644 --- a/pandas/tests/frame/methods/test_dot.py +++ b/pandas/tests/frame/methods/test_dot.py @@ -153,3 +153,19 @@ def test_arrow_dtype(dtype, exp_dtype): expected = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=exp_dtype) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype,exp_dtype", + [("Float32", "Float64"), ("Int16", "Int32"), ("float[pyarrow]", "double[pyarrow]")], +) +def test_arrow_dtype_series(dtype, exp_dtype): + pytest.importorskip("pyarrow") + + cols = ["a", "b"] + series_a = Series([1, 2], index=cols, dtype="int32") + df_b = DataFrame([[1, 0], [0, 1]], index=cols, dtype=dtype) + result = series_a.dot(df_b) + expected = Series([1, 2], dtype=exp_dtype) + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 67d1d45af1cb3..8915d6f205d65 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.errors import OutOfBoundsDatetime + from pandas import ( Categorical, DataFrame, @@ -781,3 +783,15 @@ def 
test_fillna_with_none_object(test_frame, dtype): if test_frame: expected = expected.to_frame() tm.assert_equal(result, expected) + + +def test_fillna_out_of_bounds_datetime(): + # GH#61208 + df = DataFrame( + {"datetime": date_range("1/1/2011", periods=3, freq="h"), "value": [1, 2, 3]} + ) + df.iloc[0, 0] = None + + msg = "Cannot cast 0001-01-01 00:00:00 to unit='ns' without overflow" + with pytest.raises(OutOfBoundsDatetime, match=msg): + df.fillna(Timestamp("0001-01-01")) diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index c6e5304ae3cb4..08b7128e6ec11 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -153,11 +153,11 @@ def test_nlargest_n_duplicate_index(self, n, order, request): index=[0, 0, 1, 1, 1], ) result = df.nsmallest(n, order) - expected = df.sort_values(order).head(n) + expected = df.sort_values(order, kind="stable").head(n) tm.assert_frame_equal(result, expected) result = df.nlargest(n, order) - expected = df.sort_values(order, ascending=False).head(n) + expected = df.sort_values(order, ascending=False, kind="stable").head(n) if Version(np.__version__) >= Version("1.25") and ( (order == ["a"] and n in (1, 2, 3, 4)) or ((order == ["a", "b"]) and n == 5) ): diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index e728526519e9d..9a628c2ee9f73 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -630,6 +630,13 @@ def test_sort_values_no_op_reset_index(self): expected = DataFrame({"A": [10, 20], "B": [1, 5]}) tm.assert_frame_equal(result, expected) + def test_sort_by_column_named_none(self): + # GH#61512 + df = DataFrame([[3, 1], [2, 2]], columns=[None, "C1"]) + result = df.sort_values(by=None) + expected = DataFrame([[2, 2], [3, 1]], columns=[None, "C1"], index=[1, 0]) + tm.assert_frame_equal(result, expected) + class TestDataFrameSortKey: # test key sorting (issue 27237) def test_sort_values_inplace_key(self, sort_by_key): diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 9eafc69013ffe..34d120145b381 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1450,3 +1450,22 @@ def test_to_csv_warn_when_zip_tar_and_append_mode(self, tmp_path): RuntimeWarning, match=msg, raise_on_extra_warnings=False ): df.to_csv(tar_path, mode="a") + + def test_to_csv_escape_quotechar(self): + # GH61514 + df = DataFrame( + { + "col_a": ["a", "a2"], + "col_b": ['b"c', None], + "col_c": ['de,f"', '"c'], + } + ) + + result = df.to_csv(quotechar='"', escapechar="\\", quoting=csv.QUOTE_NONE) + expected_rows = [ + ",col_a,col_b,col_c", + '0,a,b\\"c,de\\,f\\"', + '1,a2,,\\"c', + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py index 36088cceb13f1..f68d7f533645d 100644 --- a/pandas/tests/frame/methods/test_to_numpy.py +++ b/pandas/tests/frame/methods/test_to_numpy.py @@ -3,7 +3,9 @@ from pandas import ( DataFrame, + NaT, Timestamp, + date_range, ) import pandas._testing as tm @@ -41,3 +43,37 @@ def test_to_numpy_mixed_dtype_to_str(self): result = df.to_numpy(dtype=str) expected = np.array([["2020-01-01 00:00:00", "100.0"]], dtype=str) tm.assert_numpy_array_equal(result, expected) + + def test_to_numpy_datetime_with_na(self): + # 
GH #53115 + dti = date_range("2016-01-01", periods=3) + df = DataFrame(dti) + df.iloc[0, 0] = NaT + expected = np.array([[np.nan], [1.45169280e18], [1.45177920e18]]) + result = df.to_numpy(float, na_value=np.nan) + tm.assert_numpy_array_equal(result, expected) + + df = DataFrame( + { + "a": [Timestamp("1970-01-01"), Timestamp("1970-01-02"), NaT], + "b": [ + Timestamp("1970-01-01"), + np.nan, + Timestamp("1970-01-02"), + ], + "c": [ + 1, + np.nan, + 2, + ], + } + ) + expected = np.array( + [ + [0.00e00, 0.00e00, 1.00e00], + [8.64e04, np.nan, np.nan], + [np.nan, 8.64e04, 2.00e00], + ] + ) + result = df.to_numpy(float, na_value=np.nan) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 037a2ae294bb2..2426c89dbcff5 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2780,6 +2780,19 @@ def test_construction_nan_value_timedelta64_dtype(self): ) tm.assert_frame_equal(result, expected) + def test_dataframe_from_array_like_with_name_attribute(self): + # GH#61443 + class DummyArray(np.ndarray): + def __new__(cls, input_array): + obj = np.asarray(input_array).view(cls) + obj.name = "foo" + return obj + + dummy = DummyArray(np.eye(3)) + df = DataFrame(dummy) + expected = DataFrame(np.eye(3)) + tm.assert_frame_equal(df, expected) + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): diff --git a/pandas/tests/frame/test_dropna_mixedtypes.py b/pandas/tests/frame/test_dropna_mixedtypes.py new file mode 100644 index 0000000000000..565e0ab988538 --- /dev/null +++ b/pandas/tests/frame/test_dropna_mixedtypes.py @@ -0,0 +1,44 @@ +import pandas as pd +import pytest + +# ✅ Existing test for mixed types error +def test_dropna_mixed_types_error(): + df = pd.DataFrame({"A": [1, "two", None]}) # Mixed types in one column + with pytest.raises(TypeError, match="dropna.*uniform column types"): + df.dropna() + +# ✅ Safe Test 1: dropna on uniform types, default settings +def test_dropna_uniform_column(): + df = pd.DataFrame({"A": [1, 2, None], "B": [3, 4, 5]}) + result = df.dropna() + expected = pd.DataFrame({"A": [1.0, 2.0], "B": [3, 4]}, index=[0, 1]) + pd.testing.assert_frame_equal(result, expected) + +# ✅ Safe Test 2: dropna(axis=1) to drop all-NaN columns +def test_dropna_axis1(): + df = pd.DataFrame({ + "A": [1, 2, 3], + "B": [None, None, None], + "C": [4, 5, 6] + }) + result = df.dropna(axis=1) + expected = pd.DataFrame({ + "A": [1, 2, 3], + "C": [4, 5, 6] + }) + pd.testing.assert_frame_equal(result, expected) + +# ✅ Safe Test 3: dropna with subset on clean data +def test_dropna_with_subset(): + df = pd.DataFrame({ + "A": [1, None, 3], + "B": [4, 5, 6], + "C": [None, None, 9] + }) + result = df.dropna(subset=["A"]) + expected = pd.DataFrame({ + "A": [1.0, 3.0], + "B": [4, 6], + "C": [None, 9] + }, index=[0, 2]) + pd.testing.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 375b9b00a4988..f93105498ac79 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1345,6 +1345,11 @@ def test_start_with_spaces(self, df): expect = df[" A"] + df[" "] tm.assert_series_equal(res, expect) + def test_ints(self, df): + res = df.query("`1` == 7") + expect = df[df[1] == 7] + tm.assert_frame_equal(res, expect) + def test_lots_of_operators_string(self, df): res = df.query("` &^ :!€$?(} > 
<++*'' ` > 4") expect = df[df[" &^ :!€$?(} > <++*'' "] > 4] diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 127f0fc50a747..cc23c292b66dc 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1917,6 +1917,39 @@ def test_df_empty_nullable_min_count_1(self, opname, dtype, exp_dtype): expected = Series([pd.NA, pd.NA], dtype=exp_dtype, index=Index([0, 1])) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "data", + [ + {"a": [0, 1, 2], "b": [pd.NaT, pd.NaT, pd.NaT]}, + {"a": [0, 1, 2], "b": [Timestamp("1990-01-01"), pd.NaT, pd.NaT]}, + { + "a": [0, 1, 2], + "b": [ + Timestamp("1990-01-01"), + Timestamp("1991-01-01"), + Timestamp("1992-01-01"), + ], + }, + { + "a": [0, 1, 2], + "b": [pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.NaT], + }, + { + "a": [0, 1, 2], + "b": [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + ], + }, + ], + ) + def test_df_cov_pd_nat(self, data): + # GH #53115 + df = DataFrame(data) + with pytest.raises(TypeError, match="not supported for cov"): + df.cov() + def test_sum_timedelta64_skipna_false(): # GH#17235 diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index a88090b00499d..4b841b54c488b 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -427,53 +427,6 @@ def test_binops(request, args, annotate, all_binary_operators): if annotate == "right" and isinstance(right, int): pytest.skip("right is an int and doesn't support .attrs") - if not (isinstance(left, int) or isinstance(right, int)) and annotate != "both": - if not all_binary_operators.__name__.startswith("r"): - if annotate == "right" and isinstance(left, type(right)): - request.applymarker( - pytest.mark.xfail( - reason=f"{all_binary_operators} doesn't work when right has " - f"attrs and both are {type(left)}" - ) - ) - if not isinstance(left, type(right)): - if annotate == "left" and isinstance(left, pd.Series): - request.applymarker( - pytest.mark.xfail( - reason=f"{all_binary_operators} doesn't work when the " - "objects are different Series has attrs" - ) - ) - elif annotate == "right" and isinstance(right, pd.Series): - request.applymarker( - pytest.mark.xfail( - reason=f"{all_binary_operators} doesn't work when the " - "objects are different Series has attrs" - ) - ) - else: - if annotate == "left" and isinstance(left, type(right)): - request.applymarker( - pytest.mark.xfail( - reason=f"{all_binary_operators} doesn't work when left has " - f"attrs and both are {type(left)}" - ) - ) - if not isinstance(left, type(right)): - if annotate == "right" and isinstance(right, pd.Series): - request.applymarker( - pytest.mark.xfail( - reason=f"{all_binary_operators} doesn't work when the " - "objects are different Series has attrs" - ) - ) - elif annotate == "left" and isinstance(left, pd.Series): - request.applymarker( - pytest.mark.xfail( - reason=f"{all_binary_operators} doesn't work when the " - "objects are different Series has attrs" - ) - ) if annotate in {"left", "both"} and not isinstance(left, int): left.attrs = {"a": 1} if annotate in {"right", "both"} and not isinstance(right, int): @@ -497,6 +450,18 @@ def test_binops(request, args, annotate, all_binary_operators): assert result.attrs == {"a": 1} +@pytest.mark.parametrize("left", [pd.Series, pd.DataFrame]) +@pytest.mark.parametrize("right", [pd.Series, pd.DataFrame]) +def 
test_attrs_binary_operations(all_binary_operators, left, right): + # GH 51607 + attrs = {"a": 1} + left = left([1]) + left.attrs = attrs + right = right([2]) + assert all_binary_operators(left, right).attrs == attrs + assert all_binary_operators(right, left).attrs == attrs + + # ---------------------------------------------------------------------------- # Accessors diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py index 9fe9bca8abdc9..5f805ab37c396 100644 --- a/pandas/tests/generic/test_to_xarray.py +++ b/pandas/tests/generic/test_to_xarray.py @@ -6,11 +6,13 @@ DataFrame, MultiIndex, Series, + StringDtype, date_range, ) import pandas._testing as tm +from pandas.util.version import Version -pytest.importorskip("xarray") +xarray = pytest.importorskip("xarray") class TestDataFrameToXArray: @@ -29,13 +31,17 @@ def df(self): } ) - def test_to_xarray_index_types(self, index_flat, df, using_infer_string): + def test_to_xarray_index_types(self, index_flat, df, request): index = index_flat # MultiIndex is tested in test_to_xarray_with_multiindex if len(index) == 0: pytest.skip("Test doesn't make sense for empty index") - - from xarray import Dataset + elif Version(xarray.__version__) <= Version("2024.9.0"): + request.applymarker( + pytest.mark.xfail( + reason="Categorical column not preserved.", + ) + ) df.index = index[:4] df.index.name = "foo" @@ -45,29 +51,22 @@ def test_to_xarray_index_types(self, index_flat, df, using_infer_string): assert len(result.coords) == 1 assert len(result.data_vars) == 8 tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) - assert isinstance(result, Dataset) + assert isinstance(result, xarray.Dataset) # idempotency # datetimes w/tz are preserved # column names are lost expected = df.copy() - expected["f"] = expected["f"].astype( - object if not using_infer_string else "str" - ) expected.columns.name = None tm.assert_frame_equal(result.to_dataframe(), expected) def test_to_xarray_empty(self, df): - from xarray import Dataset - df.index.name = "foo" result = df[0:0].to_xarray() assert result.sizes["foo"] == 0 - assert isinstance(result, Dataset) + assert isinstance(result, xarray.Dataset) def test_to_xarray_with_multiindex(self, df, using_infer_string): - from xarray import Dataset - # MultiIndex df.index = MultiIndex.from_product([["a"], range(4)], names=["one", "two"]) result = df.to_xarray() @@ -76,7 +75,7 @@ def test_to_xarray_with_multiindex(self, df, using_infer_string): assert len(result.coords) == 2 assert len(result.data_vars) == 8 tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) - assert isinstance(result, Dataset) + assert isinstance(result, xarray.Dataset) result = result.to_dataframe() expected = df.copy() @@ -88,12 +87,22 @@ def test_to_xarray_with_multiindex(self, df, using_infer_string): class TestSeriesToXArray: - def test_to_xarray_index_types(self, index_flat): + def test_to_xarray_index_types(self, index_flat, request): index = index_flat + if ( + isinstance(index.dtype, StringDtype) + and index.dtype.storage == "pyarrow" + and Version(xarray.__version__) > Version("2024.9.0") + and Version(xarray.__version__) < Version("2025.6.0") + ): + request.applymarker( + pytest.mark.xfail( + reason="xarray calling reshape of ArrowExtensionArray", + raises=NotImplementedError, + ) + ) # MultiIndex is tested in test_to_xarray_with_multiindex - from xarray import DataArray - ser = Series(range(len(index)), index=index, dtype="int64") ser.index.name = "foo" result = ser.to_xarray() @@ -101,30 
+110,26 @@ def test_to_xarray_index_types(self, index_flat): assert len(result) == len(index) assert len(result.coords) == 1 tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) - assert isinstance(result, DataArray) + assert isinstance(result, xarray.DataArray) # idempotency tm.assert_series_equal(result.to_series(), ser) def test_to_xarray_empty(self): - from xarray import DataArray - ser = Series([], dtype=object) ser.index.name = "foo" result = ser.to_xarray() assert len(result) == 0 assert len(result.coords) == 1 tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) - assert isinstance(result, DataArray) + assert isinstance(result, xarray.DataArray) def test_to_xarray_with_multiindex(self): - from xarray import DataArray - mi = MultiIndex.from_product([["a", "b"], range(3)], names=["one", "two"]) ser = Series(range(6), dtype="int64", index=mi) result = ser.to_xarray() assert len(result) == 2 tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) - assert isinstance(result, DataArray) + assert isinstance(result, xarray.DataArray) res = result.to_series() tm.assert_series_equal(res, ser) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index b7e6e55739c17..4f6c27bd327cb 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1807,3 +1807,20 @@ def test_groupby_aggregation_func_list_multi_index_duplicate_columns(): index=Index(["level1.1", "level1.2"]), ) tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregate_empty_builtin_sum(): + df = DataFrame(columns=["Group", "Data"]) + result = df.groupby(["Group"], as_index=False)["Data"].agg("sum") + expected = DataFrame(columns=["Group", "Data"]) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregate_empty_udf(): + def func(x): + return sum(x) + + df = DataFrame(columns=["Group", "Data"]) + result = df.groupby(["Group"], as_index=False)["Data"].agg(func) + expected = DataFrame(columns=["Group", "Data"]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e49be8c00b426..cae3013642739 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -506,6 +506,23 @@ def test_observed_groups(observed): tm.assert_dict_equal(result, expected) +def test_groups_na_category(dropna, observed): + # https://github.com/pandas-dev/pandas/issues/61356 + df = DataFrame( + {"cat": Categorical(["a", np.nan, "a"], categories=list("adb"))}, + index=list("xyz"), + ) + g = df.groupby("cat", observed=observed, dropna=dropna) + + result = g.groups + expected = {"a": Index(["x", "z"])} + if not dropna: + expected |= {np.nan: Index(["y"])} + if not observed: + expected |= {"b": Index([]), "d": Index([])} + tm.assert_dict_equal(result, expected) + + @pytest.mark.parametrize( "keys, expected_values, expected_index_levels", [ diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 864b9e5d55991..0012074b9f995 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -671,7 +671,7 @@ def test_groupby_raises_category_on_category( "nunique": (None, ""), "pct_change": (TypeError, "unsupported operand type"), "prod": (TypeError, "category type does not support prod operations"), - "quantile": (TypeError, ""), + "quantile": (TypeError, "No matching signature found"), "rank": (None, ""), 
"sem": ( TypeError, diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 45047fe004aa0..014558bbf4bba 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -20,6 +20,7 @@ isna, ) import pandas._testing as tm +from pandas.tests.groupby import get_groupby_method_args from pandas.util import _test_decorators as td @@ -956,17 +957,95 @@ def test_min_empty_string_dtype(func, string_dtype_no_object): @pytest.mark.parametrize("min_count", [0, 1]) -def test_string_dtype_empty_sum(string_dtype_no_object, skipna, min_count): - # https://github.com/pandas-dev/pandas/issues/60229 +@pytest.mark.parametrize("test_series", [True, False]) +def test_string_dtype_all_na( + string_dtype_no_object, reduction_func, skipna, min_count, test_series +): + # https://github.com/pandas-dev/pandas/issues/60985 + if reduction_func == "corrwith": + # corrwith is deprecated. + return + dtype = string_dtype_no_object + + if reduction_func in [ + "any", + "all", + "idxmin", + "idxmax", + "mean", + "median", + "std", + "var", + ]: + kwargs = {"skipna": skipna} + elif reduction_func in ["kurt"]: + kwargs = {"min_count": min_count} + elif reduction_func in ["count", "nunique", "quantile", "sem", "size"]: + kwargs = {} + else: + kwargs = {"skipna": skipna, "min_count": min_count} + + expected_dtype, expected_value = dtype, pd.NA + if reduction_func in ["all", "any"]: + expected_dtype = "bool" + # TODO: For skipna=False, bool(pd.NA) raises; should groupby? + expected_value = not skipna if reduction_func == "any" else True + elif reduction_func in ["count", "nunique", "size"]: + # TODO: Should be more consistent - return Int64 when dtype.na_value is pd.NA? + if ( + test_series + and reduction_func == "size" + and dtype.storage == "pyarrow" + and dtype.na_value is pd.NA + ): + expected_dtype = "Int64" + else: + expected_dtype = "int64" + expected_value = 1 if reduction_func == "size" else 0 + elif reduction_func in ["idxmin", "idxmax"]: + expected_dtype, expected_value = "float64", np.nan + elif not skipna or min_count > 0: + expected_value = pd.NA + elif reduction_func == "sum": + # https://github.com/pandas-dev/pandas/pull/60936 + expected_value = "" + df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype) - gb = df.groupby("a") - result = gb.sum(skipna=skipna, min_count=min_count) - value = "" if skipna and min_count == 0 else pd.NA - expected = DataFrame( - {"b": value}, index=pd.Index(["x"], name="a", dtype=dtype), dtype=dtype - ) - tm.assert_frame_equal(result, expected) + obj = df["b"] if test_series else df + args = get_groupby_method_args(reduction_func, obj) + gb = obj.groupby(df["a"]) + method = getattr(gb, reduction_func) + + if reduction_func in [ + "mean", + "median", + "kurt", + "prod", + "quantile", + "sem", + "skew", + "std", + "var", + ]: + msg = f"dtype '{dtype}' does not support operation '{reduction_func}'" + with pytest.raises(TypeError, match=msg): + method(*args, **kwargs) + return + elif reduction_func in ["idxmin", "idxmax"] and not skipna: + msg = f"{reduction_func} with skipna=False encountered an NA value." 
+ with pytest.raises(ValueError, match=msg): + method(*args, **kwargs) + return + + result = method(*args, **kwargs) + index = pd.Index(["x"], name="a", dtype=dtype) + if test_series or reduction_func == "size": + name = None if not test_series and reduction_func == "size" else "b" + expected = Series(expected_value, index=index, dtype=expected_dtype, name=name) + else: + expected = DataFrame({"b": expected_value}, index=index, dtype=expected_dtype) + tm.assert_equal(result, expected) def test_max_nan_bug(): diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index e09883e95ecec..1ec936f830768 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -802,8 +802,9 @@ def test_frequency_A_raises(self, freq): ) def test_date_range_depr_lowercase_frequency(self, freq, freq_depr): # GH#58998 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - "in a future version." + depr_msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." + ) expected = date_range("1/1/2000", periods=4, freq=freq) with tm.assert_produces_warning(FutureWarning, match=depr_msg): diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 7ef6efad0ff6f..7a68cb867c94e 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -201,6 +201,69 @@ def test_union_same_timezone_different_units(self): expected = date_range("2000-01-01", periods=3, tz="UTC").as_unit("us") tm.assert_index_equal(result, expected) + def test_union_same_nonzero_timezone_different_units(self): + # GH 60080 - fix timezone being changed to UTC when units differ + # but timezone is the same + tz = "UTC+05:00" + idx1 = date_range("2000-01-01", periods=3, tz=tz).as_unit("us") + idx2 = date_range("2000-01-01", periods=3, tz=tz).as_unit("ns") + + # Check pre-conditions + assert idx1.tz == idx2.tz + assert idx1.dtype != idx2.dtype # Different units + + # Test union preserves timezone when units differ + result = idx1.union(idx2) + expected = date_range("2000-01-01", periods=3, tz=tz).as_unit("ns") + tm.assert_index_equal(result, expected) + + def test_union_different_dates_same_timezone_different_units(self): + # GH 60080 - fix timezone being changed to UTC when units differ + # but timezone is the same + tz = "UTC+05:00" + idx1 = date_range("2000-01-01", periods=3, tz=tz).as_unit("us") + idx3 = date_range("2000-01-03", periods=3, tz=tz).as_unit("us") + + # Test with different dates to ensure it's not just returning one of the inputs + result = idx1.union(idx3) + expected = DatetimeIndex( + ["2000-01-01", "2000-01-02", "2000-01-03", "2000-01-04", "2000-01-05"], + tz=tz, + ).as_unit("us") + tm.assert_index_equal(result, expected) + + def test_intersection_same_timezone_different_units(self): + # GH 60080 - fix timezone being changed to UTC when units differ + # but timezone is the same + tz = "UTC+05:00" + idx1 = date_range("2000-01-01", periods=3, tz=tz).as_unit("us") + idx2 = date_range("2000-01-01", periods=3, tz=tz).as_unit("ns") + + # Check pre-conditions + assert idx1.tz == idx2.tz + assert idx1.dtype != idx2.dtype # Different units + + # Test intersection + result = idx1.intersection(idx2) + expected = date_range("2000-01-01", periods=3, tz=tz).as_unit("ns") + tm.assert_index_equal(result, expected) + + def test_symmetric_difference_same_timezone_different_units(self): + # GH 
60080 - fix timezone being changed to UTC when units differ + # but timezone is the same + tz = "UTC+05:00" + idx1 = date_range("2000-01-01", periods=3, tz=tz).as_unit("us") + idx4 = date_range("2000-01-02", periods=3, tz=tz).as_unit("ns") + + # Check pre-conditions + assert idx1.tz == idx4.tz + assert idx1.dtype != idx4.dtype # Different units + + # Test symmetric_difference + result = idx1.symmetric_difference(idx4) + expected = DatetimeIndex(["2000-01-01", "2000-01-04"], tz=tz).as_unit("ns") + tm.assert_index_equal(result, expected) + # TODO: moved from test_datetimelike; de-duplicate with version below def test_intersection2(self): first = date_range("2020-01-01", periods=10) diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index 51b03024ce272..85af43e7d2e5e 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -206,8 +206,10 @@ def test_constructor_U(self): @pytest.mark.parametrize("freq_depr", ["2MIN", "2US", "2NS"]) def test_uppercase_freq_deprecated_from_time_series(self, freq_depr): # GH#52536, GH#54939 - msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " - f"future version. Please use '{freq_depr.lower()[1:]}' instead." + msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.lower()[1:]}' instead." + ) with tm.assert_produces_warning(FutureWarning, match=msg): period_range("2020-01-01 00:00:00 00:00", periods=2, freq=freq_depr) @@ -230,8 +232,10 @@ def test_A_raises_from_time_series(self, freq): @pytest.mark.parametrize("freq", ["2w"]) def test_lowercase_freq_from_time_series_deprecated(self, freq): # GH#52536, GH#54939 - msg = f"'{freq[1:]}' is deprecated and will be removed in a " - f"future version. Please use '{freq.upper()[1:]}' instead." + msg = ( + f"'{freq[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq.upper()[1:]}' instead." 
+ ) with tm.assert_produces_warning(FutureWarning, match=msg): period_range(freq=freq, start="1/1/2001", end="12/1/2009") diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 2f6998a85c80b..3be69617cad43 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -726,15 +726,16 @@ def test_iloc_setitem_with_scalar_index(self, indexer, value): @pytest.mark.filterwarnings("ignore::UserWarning") def test_iloc_mask(self): - # GH 3631, iloc with a mask (of a series) should raise + # GH 60994, iloc with a mask (of a series) should return accordingly df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) mask = df.a % 2 == 0 msg = "iLocation based boolean indexing cannot use an indexable as a mask" with pytest.raises(ValueError, match=msg): df.iloc[mask] + mask.index = range(len(mask)) - msg = "iLocation based boolean indexing on an integer type is not available" - with pytest.raises(NotImplementedError, match=msg): + msg = "Unalignable boolean Series provided as indexer" + with pytest.raises(IndexingError, match=msg): df.iloc[mask] # ndarray ok @@ -753,18 +754,13 @@ def test_iloc_mask(self): (None, ".iloc"): "0b1100", ("index", ""): "0b11", ("index", ".loc"): "0b11", - ("index", ".iloc"): ( - "iLocation based boolean indexing cannot use an indexable as a mask" - ), - ("locs", ""): "Unalignable boolean Series provided as indexer " - "(index of the boolean Series and of the indexed " - "object do not match).", - ("locs", ".loc"): "Unalignable boolean Series provided as indexer " - "(index of the boolean Series and of the " - "indexed object do not match).", - ("locs", ".iloc"): ( - "iLocation based boolean indexing on an integer type is not available" - ), + ( + "index", + ".iloc", + ): "iLocation based boolean indexing cannot use an indexable as a mask", + ("locs", ""): "Unalignable boolean Series provided as indexer", + ("locs", ".loc"): "Unalignable boolean Series provided as indexer", + ("locs", ".iloc"): "Unalignable boolean Series provided as indexer", } # UserWarnings from reindex of a boolean mask @@ -780,18 +776,52 @@ def test_iloc_mask(self): else: accessor = df answer = str(bin(accessor[mask]["nums"].sum())) - except (ValueError, IndexingError, NotImplementedError) as err: + except (ValueError, IndexingError) as err: answer = str(err) key = ( idx, method, ) - r = expected.get(key) - if r != answer: - raise AssertionError( - f"[{key}] does not match [{answer}], received [{r}]" + expected_result = expected.get(key) + + # Fix the assertion to check for substring match + if ( + idx is None or (idx == "index" and method != ".iloc") + ) and "0b" in expected_result: + # For successful numeric results, exact match is needed + assert expected_result == answer, ( + f"[{key}] does not match [{answer}]" ) + else: + # For error messages, substring match is sufficient + assert expected_result in answer, f"[{key}] not found in [{answer}]" + + def test_iloc_with_numpy_bool_array(self): + df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) + result = df.iloc[np.array([True, False, True, False, True], dtype=bool)] + expected = DataFrame({"a": [0, 2, 4]}, index=["A", "C", "E"]) + tm.assert_frame_equal(result, expected) + + def test_iloc_series_mask_with_index_mismatch_raises(self): + df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) + mask = df.a % 2 == 0 + msg = "Unalignable boolean Series provided as indexer" + with pytest.raises(IndexingError, match=msg): + df.iloc[Series([True] * 
len(mask), dtype=bool)] + + def test_iloc_series_mask_all_true(self): + df = DataFrame(list(range(5)), columns=["a"]) + mask = Series([True] * len(df), dtype=bool) + result = df.iloc[mask] + tm.assert_frame_equal(result, df) + + def test_iloc_series_mask_alternate_true(self): + df = DataFrame(list(range(5)), columns=["a"]) + mask = Series([True, False, True, False, True], dtype=bool) + result = df.iloc[mask] + expected = DataFrame({"a": [0, 2, 4]}, index=[0, 2, 4]) + tm.assert_frame_equal(result, expected) def test_iloc_non_unique_indexing(self): # GH 4017, non-unique indexing (on the axis) diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 8d46442611719..ebc6ff5be108f 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -824,6 +824,46 @@ def test_to_latex_escape_special_chars(self): ) assert result == expected + def test_to_latex_escape_special_chars_in_index_names(self): + # https://github.com/pandas-dev/pandas/issues/61309 + # https://github.com/pandas-dev/pandas/issues/57362 + index = "&%$#_{}}~^\\" + df = DataFrame({index: [1, 2, 3]}).set_index(index) + result = df.to_latex(escape=True) + expected = _dedent( + r""" + \begin{tabular}{l} + \toprule + \&\%\$\#\_\{\}\}\textasciitilde \textasciicircum \textbackslash \\ + \midrule + 1 \\ + 2 \\ + 3 \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected + + def test_to_latex_escape_special_chars_in_column_name(self): + df = DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]}) + df.columns.name = "_^~" + result = df.to_latex(escape=True) + expected = _dedent( + r""" + \begin{tabular}{lrl} + \toprule + \_\textasciicircum \textasciitilde & A & B \\ + \midrule + 0 & 1 & a \\ + 1 & 2 & b \\ + 2 & 3 & c \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected + def test_to_latex_specified_header_special_chars_without_escape(self): # GH 7124 df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index 63c975fd831e7..0866581535c2f 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -777,9 +777,9 @@ def test_to_string_string_dtype(self): result = df.dtypes.to_string() expected = dedent( """\ - x string[pyarrow] - y string[python] - z int64[pyarrow]""" + x string + y string + z int64[pyarrow]""" ) assert result == expected diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 3a68d38cc0bde..213fa2c01cef4 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -546,7 +546,7 @@ def test_na_values_dict_null_column_name(all_parsers): parser = all_parsers data = ",x,y\n\nMA,1,2\nNA,2,1\nOA,,3" names = [None, "x", "y"] - na_values = {name: STR_NA_VALUES for name in names} + na_values = dict.fromkeys(names, STR_NA_VALUES) dtype = {None: "object", "x": "float64", "y": "float64"} if parser.engine == "pyarrow": diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index bb2058c050f2a..5cfefeb469e8a 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import PY312 import pandas as pd @@ -25,7 +23,9 @@ timedelta_range, ) import pandas._testing as tm -from pandas.conftest import has_pyarrow +from 
pandas.api.types import ( + CategoricalDtype, +) from pandas.tests.io.pytables.common import ( _maybe_remove, ensure_clean_store, @@ -385,20 +385,24 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path): tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]])) -@pytest.mark.xfail( - using_string_dtype() and has_pyarrow, - reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed", -) @pytest.mark.parametrize("format", ["fixed", "table"]) -def test_to_hdf_errors(tmp_path, format, setup_path): +def test_to_hdf_errors(tmp_path, format, setup_path, using_infer_string): data = ["\ud800foo"] - ser = Series(data, index=Index(data)) + ser = Series(data, index=Index(data, dtype="object"), dtype="object") path = tmp_path / setup_path # GH 20835 ser.to_hdf(path, key="table", format=format, errors="surrogatepass") result = read_hdf(path, "table", errors="surrogatepass") - tm.assert_series_equal(result, ser) + + if using_infer_string: + # https://github.com/pandas-dev/pandas/pull/60993 + # Surrogates fallback to python storage. + dtype = pd.StringDtype(storage="python", na_value=np.nan) + else: + dtype = "object" + expected = Series(data, index=Index(data, dtype=dtype), dtype=dtype) + tm.assert_series_equal(result, expected) def test_create_table_index(setup_path): @@ -1106,3 +1110,23 @@ def test_store_bool_index(tmp_path, setup_path): df.to_hdf(path, key="a") result = read_hdf(path, "a") tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("model", ["name", "longname", "verylongname"]) +def test_select_categorical_string_columns(tmp_path, model): + # Corresponding to BUG: 57608 + + path = tmp_path / "test.h5" + + models = CategoricalDtype(categories=["name", "longname", "verylongname"]) + df = DataFrame( + {"modelId": ["name", "longname", "longname"], "value": [1, 2, 3]} + ).astype({"modelId": models, "value": int}) + + with HDFStore(path, "w") as store: + store.append("df", df, data_columns=["modelId"]) + + with HDFStore(path, "r") as store: + result = store.select("df", "modelId == model") + expected = df[df["modelId"] == model] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 99af421d5aa48..4a5e41397b59d 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -650,7 +650,7 @@ def close(self): handles.created_handles.append(TestError()) -@td.skip_if_no("fsspec", min_version="2023.1.0") +@td.skip_if_no("fsspec") @pytest.mark.parametrize("compression", [None, "infer"]) def test_read_csv_chained_url_no_error(compression): # GH 60100 diff --git a/pandas/tests/io/test_iceberg.py b/pandas/tests/io/test_iceberg.py new file mode 100644 index 0000000000000..916c1d2af9b12 --- /dev/null +++ b/pandas/tests/io/test_iceberg.py @@ -0,0 +1,222 @@ +""" +Tests for the Apache Iceberg format. + +Tests in this file use a simple Iceberg catalog based on SQLite, with the same +data used for Parquet tests (``pandas/tests/io/data/parquet/simple.parquet``). 
+""" + +import collections +import importlib +import pathlib + +import pytest + +import pandas as pd +import pandas._testing as tm + +from pandas.io.iceberg import read_iceberg + +pytestmark = pytest.mark.single_cpu + +pyiceberg = pytest.importorskip("pyiceberg") +pyiceberg_catalog = pytest.importorskip("pyiceberg.catalog") +pq = pytest.importorskip("pyarrow.parquet") + +Catalog = collections.namedtuple("Catalog", ["name", "uri", "warehouse"]) + + +@pytest.fixture +def catalog(request, tmp_path): + # the catalog stores the full path of data files, so the catalog needs to be + # created dynamically, and not saved in pandas/tests/io/data as other formats + uri = f"sqlite:///{tmp_path}/catalog.sqlite" + warehouse = f"file://{tmp_path}" + catalog_name = request.param if hasattr(request, "param") else None + catalog = pyiceberg_catalog.load_catalog( + catalog_name or "default", + type="sql", + uri=uri, + warehouse=warehouse, + ) + catalog.create_namespace("ns") + + df = pq.read_table( + pathlib.Path(__file__).parent / "data" / "parquet" / "simple.parquet" + ) + table = catalog.create_table("ns.my_table", schema=df.schema) + table.append(df) + + if catalog_name is not None: + config_path = pathlib.Path.home() / ".pyiceberg.yaml" + with open(config_path, "w", encoding="utf-8") as f: + f.write(f"""\ +catalog: + {catalog_name}: + type: sql + uri: {uri} + warehouse: {warehouse}""") + + importlib.reload(pyiceberg_catalog) # needed to reload the config file + + yield Catalog(name=catalog_name or "default", uri=uri, warehouse=warehouse) + + if catalog_name is not None: + config_path.unlink() + + +class TestIceberg: + def test_read(self, catalog): + expected = pd.DataFrame( + { + "A": [1, 2, 3], + "B": ["foo", "foo", "foo"], + } + ) + result = read_iceberg( + "ns.my_table", + catalog_properties={"uri": catalog.uri}, + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("catalog", ["default", "pandas_tests"], indirect=True) + def test_read_by_catalog_name(self, catalog): + expected = pd.DataFrame( + { + "A": [1, 2, 3], + "B": ["foo", "foo", "foo"], + } + ) + result = read_iceberg( + "ns.my_table", + catalog_name=catalog.name, + ) + tm.assert_frame_equal(result, expected) + + def test_read_with_row_filter(self, catalog): + expected = pd.DataFrame( + { + "A": [2, 3], + "B": ["foo", "foo"], + } + ) + result = read_iceberg( + "ns.my_table", + catalog_properties={"uri": catalog.uri}, + row_filter="A > 1", + ) + tm.assert_frame_equal(result, expected) + + def test_read_with_case_sensitive(self, catalog): + expected = pd.DataFrame( + { + "A": [1, 2, 3], + } + ) + result = read_iceberg( + "ns.my_table", + catalog_properties={"uri": catalog.uri}, + selected_fields=["a"], + case_sensitive=False, + ) + tm.assert_frame_equal(result, expected) + + with pytest.raises(ValueError, match="^Could not find column"): + read_iceberg( + "ns.my_table", + catalog_properties={"uri": catalog.uri}, + selected_fields=["a"], + case_sensitive=True, + ) + + def test_read_with_limit(self, catalog): + expected = pd.DataFrame( + { + "A": [1, 2], + "B": ["foo", "foo"], + } + ) + result = read_iceberg( + "ns.my_table", + catalog_properties={"uri": catalog.uri}, + limit=2, + ) + tm.assert_frame_equal(result, expected) + + def test_write(self, catalog): + df = pd.DataFrame( + { + "A": [1, 2, 3], + "B": ["foo", "foo", "foo"], + } + ) + df.to_iceberg( + "ns.new_table", + catalog_properties={"uri": catalog.uri}, + location=catalog.warehouse, + ) + result = read_iceberg( + "ns.new_table", + catalog_properties={"uri": 
catalog.uri}, + ) + tm.assert_frame_equal(result, df) + + @pytest.mark.parametrize("catalog", ["default", "pandas_tests"], indirect=True) + def test_write_by_catalog_name(self, catalog): + df = pd.DataFrame( + { + "A": [1, 2, 3], + "B": ["foo", "foo", "foo"], + } + ) + df.to_iceberg( + "ns.new_table", + catalog_name=catalog.name, + ) + result = read_iceberg( + "ns.new_table", + catalog_name=catalog.name, + ) + tm.assert_frame_equal(result, df) + + def test_write_existing_table_with_append_true(self, catalog): + original = read_iceberg( + "ns.my_table", + catalog_properties={"uri": catalog.uri}, + ) + new = pd.DataFrame( + { + "A": [1, 2, 3], + "B": ["foo", "foo", "foo"], + } + ) + expected = pd.concat([original, new], ignore_index=True) + new.to_iceberg( + "ns.my_table", + catalog_properties={"uri": catalog.uri}, + location=catalog.warehouse, + append=True, + ) + result = read_iceberg( + "ns.my_table", + catalog_properties={"uri": catalog.uri}, + ) + tm.assert_frame_equal(result, expected) + + def test_write_existing_table_with_append_false(self, catalog): + df = pd.DataFrame( + { + "A": [1, 2, 3], + "B": ["foo", "foo", "foo"], + } + ) + df.to_iceberg( + "ns.my_table", + catalog_properties={"uri": catalog.uri}, + location=catalog.warehouse, + append=False, + ) + result = read_iceberg( + "ns.my_table", + catalog_properties={"uri": catalog.uri}, + ) + tm.assert_frame_equal(result, df) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index 950f74a686b8d..973cb21ac3041 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -169,14 +169,9 @@ def test_spss_metadata(datapath): "variable_measure": {"VAR00002": "unknown"}, "file_label": None, "file_format": "sav/zsav", + "creation_time": datetime.datetime(2015, 2, 6, 14, 33, 36), + "modification_time": datetime.datetime(2015, 2, 6, 14, 33, 36), } - if Version(pyreadstat.__version__) >= Version("1.2.4"): - metadata.update( - { - "creation_time": datetime.datetime(2015, 2, 6, 14, 33, 36), - "modification_time": datetime.datetime(2015, 2, 6, 14, 33, 36), - } - ) if Version(pyreadstat.__version__) >= Version("1.2.8"): metadata["mr_sets"] = {} tm.assert_dict_equal(df.attrs, metadata) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 13576c891ad2c..4a6a5635eb68c 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1,7 +1,6 @@ from __future__ import annotations import contextlib -from contextlib import closing import csv from datetime import ( date, @@ -2498,10 +2497,8 @@ def test_sqlalchemy_integer_overload_mapping(conn, request, integer): sql.SQLTable("test_type", db, frame=df) -@pytest.mark.parametrize("conn", all_connectable) -def test_database_uri_string(conn, request, test_frame1): +def test_database_uri_string(request, test_frame1): pytest.importorskip("sqlalchemy") - conn = request.getfixturevalue(conn) # Test read_sql and .to_sql method with a database URI (GH10654) # db_uri = 'sqlite:///:memory:' # raises # sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near @@ -2520,10 +2517,8 @@ def test_database_uri_string(conn, request, test_frame1): @td.skip_if_installed("pg8000") -@pytest.mark.parametrize("conn", all_connectable) -def test_pg8000_sqlalchemy_passthrough_error(conn, request): +def test_pg8000_sqlalchemy_passthrough_error(request): pytest.importorskip("sqlalchemy") - conn = request.getfixturevalue(conn) # using driver that will not be installed on CI to trigger error # in sqlalchemy.create_engine -> test passing of this error to 
user db_uri = "postgresql+pg8000://user:pass@host/dbname" @@ -2584,10 +2579,10 @@ def test_sql_open_close(test_frame3): # between the writing and reading (as in many real situations). with tm.ensure_clean() as name: - with closing(sqlite3.connect(name)) as conn: + with contextlib.closing(sqlite3.connect(name)) as conn: assert sql.to_sql(test_frame3, "test_frame3_legacy", conn, index=False) == 4 - with closing(sqlite3.connect(name)) as conn: + with contextlib.closing(sqlite3.connect(name)) as conn: result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", conn) tm.assert_frame_equal(test_frame3, result) @@ -2731,25 +2726,26 @@ def test_delete_rows_is_atomic(conn_name, request): replacing_df = DataFrame({"a": [5, 6, 7], "b": [8, 8, 8]}, dtype="int32") conn = request.getfixturevalue(conn_name) - pandasSQL = pandasSQL_builder(conn) + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction() as cur: + cur.execute(table_stmt) - with pandasSQL.run_transaction() as cur: - cur.execute(table_stmt) + with pandasSQL.run_transaction(): + pandasSQL.to_sql(original_df, table_name, if_exists="append", index=False) - with pandasSQL.run_transaction(): - pandasSQL.to_sql(original_df, table_name, if_exists="append", index=False) + # inserting duplicated values in a UNIQUE constraint column + with pytest.raises(pd.errors.DatabaseError): + with pandasSQL.run_transaction(): + pandasSQL.to_sql( + replacing_df, table_name, if_exists="delete_rows", index=False + ) - # inserting duplicated values in a UNIQUE constraint column - with pytest.raises(pd.errors.DatabaseError): + # failed "delete_rows" is rolled back preserving original data with pandasSQL.run_transaction(): - pandasSQL.to_sql( - replacing_df, table_name, if_exists="delete_rows", index=False + result_df = pandasSQL.read_query( + f"SELECT * FROM {table_name}", dtype="int32" ) - - # failed "delete_rows" is rolled back preserving original data - with pandasSQL.run_transaction(): - result_df = pandasSQL.read_query(f"SELECT * FROM {table_name}", dtype="int32") - tm.assert_frame_equal(result_df, original_df) + tm.assert_frame_equal(result_df, original_df) @pytest.mark.parametrize("conn", all_connectable) @@ -2759,10 +2755,10 @@ def test_roundtrip(conn, request, test_frame1): conn_name = conn conn = request.getfixturevalue(conn) - pandasSQL = pandasSQL_builder(conn) - with pandasSQL.run_transaction(): - assert pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4 - result = pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4 + result = pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") if "adbc" in conn_name: result = result.rename(columns={"__index_level_0__": "level_0"}) @@ -3456,13 +3452,6 @@ def test_to_sql_with_negative_npinf(conn, request, input): # GH 36465 # The input {"foo": [-np.inf], "infe0": ["bar"]} does not raise any error # for pymysql version >= 0.10 - # TODO(GH#36465): remove this version check after GH 36465 is fixed - pymysql = pytest.importorskip("pymysql") - - if Version(pymysql.__version__) < Version("1.0.3") and "infe0" in df.columns: - mark = pytest.mark.xfail(reason="GH 36465") - request.applymarker(mark) - msg = "Execution failed on sql" with pytest.raises(pd.errors.DatabaseError, match=msg): df.to_sql(name="foobar", con=conn, index=False) @@ -3584,13 +3573,6 @@ def test_options_get_engine(): assert isinstance(get_engine("sqlalchemy"), 
SQLAlchemyEngine) -def test_get_engine_auto_error_message(): - # Expect different error messages from get_engine(engine="auto") - # if engines aren't installed vs. are installed but bad version - pass - # TODO(GH#36893) fill this in when we add more engines - - @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) def test_read_sql_dtype_backend( diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 9288b98d79fbe..e73de78847c8f 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -2587,3 +2587,17 @@ def test_many_strl(temp_file, version): lbls = ["".join(v) for v in itertools.product(*([string.ascii_letters] * 3))] value_labels = {"col": {i: lbls[i] for i in range(n)}} df.to_stata(temp_file, value_labels=value_labels, version=version) + + +@pytest.mark.parametrize("version", [117, 118, 119, None]) +def test_strl_missings(temp_file, version): + # GH 23633 + # Check that strl supports None and pd.NA + df = DataFrame( + [ + {"str1": "string" * 500, "number": 0}, + {"str1": None, "number": 1}, + {"str1": pd.NA, "number": 1}, + ] + ) + df.to_stata(temp_file, version=version) diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index 50fef2c5eb4eb..4446dbe320b69 100644 --- a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -1345,7 +1345,7 @@ def test_ea_dtypes(any_numeric_ea_dtype, parser): assert equalize_decl(result).strip() == expected -def test_unsuported_compression(parser, geom_df): +def test_unsupported_compression(parser, geom_df): with pytest.raises(ValueError, match="Unrecognized compression type"): with tm.ensure_clean() as path: geom_df.to_xml(path, parser=parser, compression="7z") diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index d897d251909fe..cf8ae28c4d9b5 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1961,7 +1961,7 @@ def test_wrong_compression(parser, compression, compression_only): read_xml(path, parser=parser, compression=attempted_compression) -def test_unsuported_compression(parser): +def test_unsupported_compression(parser): with pytest.raises(ValueError, match="Unrecognized compression type"): with tm.ensure_clean() as path: read_xml(path, parser=parser, compression="7z") diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index 17dae1879f3b8..f619ba4dd204b 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -297,3 +297,13 @@ def test_ensure_string_array_copy(): assert not np.shares_memory(arr, result) assert arr[1] is None assert result[1] is np.nan + + +def test_ensure_string_array_list_of_lists(): + # GH#61155: ensure list of lists doesn't get converted to string + arr = [list("test"), list("word")] + result = lib.ensure_string_array(arr) + + # Each item in result should still be a list, not a stringified version + expected = np.array(["['t', 'e', 's', 't']", "['w', 'o', 'r', 'd']"], dtype=object) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index d18f098267599..3f274a336ad44 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -840,14 +840,26 @@ def test_plot_scatter_shape(self): axes = df.plot(x="x", y="y", kind="scatter", subplots=True) _check_axes_shape(axes, axes_num=1, layout=(1, 1)) - def 
test_raise_error_on_datetime_time_data(self): - # GH 8113, datetime.time type is not supported by matplotlib in scatter + def test_scatter_on_datetime_time_data(self): + # datetime.time type is now supported in scatter, since a converter + # is implemented in ScatterPlot df = DataFrame(np.random.default_rng(2).standard_normal(10), columns=["a"]) df["dtime"] = date_range(start="2014-01-01", freq="h", periods=10).time - msg = "must be a string or a (real )?number, not 'datetime.time'" + df.plot(kind="scatter", x="dtime", y="a") - with pytest.raises(TypeError, match=msg): - df.plot(kind="scatter", x="dtime", y="a") + def test_scatter_line_xticks(self): + # GH#61005 + df = DataFrame( + [(datetime(year=2025, month=1, day=1, hour=n), n) for n in range(3)], + columns=["datetime", "y"], + ) + fig, ax = plt.subplots(2, sharex=True) + df.plot.scatter(x="datetime", y="y", ax=ax[0]) + scatter_xticks = ax[0].get_xticks() + df.plot(x="datetime", y="y", ax=ax[1]) + line_xticks = ax[1].get_xticks() + assert scatter_xticks[0] == line_xticks[0] + assert scatter_xticks[-1] == line_xticks[-1] @pytest.mark.parametrize("x, y", [("dates", "vals"), (0, 1)]) def test_scatterplot_datetime_data(self, x, y): diff --git a/pandas/tests/plotting/frame/test_frame_color.py b/pandas/tests/plotting/frame/test_frame_color.py index 74ee45664e01a..5e5c3539f3283 100644 --- a/pandas/tests/plotting/frame/test_frame_color.py +++ b/pandas/tests/plotting/frame/test_frame_color.py @@ -13,7 +13,6 @@ _check_plot_works, _unpack_cycler, ) -from pandas.util.version import Version mpl = pytest.importorskip("matplotlib") plt = pytest.importorskip("matplotlib.pyplot") @@ -715,10 +714,7 @@ def test_colors_of_columns_with_same_name(self): df_concat = pd.concat([df, df1], axis=1) result = df_concat.plot() legend = result.get_legend() - if Version(mpl.__version__) < Version("3.7"): - handles = legend.legendHandles - else: - handles = legend.legend_handles + handles = legend.legend_handles for legend, line in zip(handles, result.lines): assert legend.get_color() == line.get_color() diff --git a/pandas/tests/plotting/frame/test_frame_legend.py b/pandas/tests/plotting/frame/test_frame_legend.py index a9723fe4ef871..755293e0bf6d7 100644 --- a/pandas/tests/plotting/frame/test_frame_legend.py +++ b/pandas/tests/plotting/frame/test_frame_legend.py @@ -12,7 +12,6 @@ _check_legend_marker, _check_text_labels, ) -from pandas.util.version import Version mpl = pytest.importorskip("matplotlib") @@ -32,10 +31,7 @@ def test_mixed_yerr(self): df.plot("x", "b", c="blue", yerr=None, ax=ax, label="blue") legend = ax.get_legend() - if Version(mpl.__version__) < Version("3.7"): - result_handles = legend.legendHandles - else: - result_handles = legend.legend_handles + result_handles = legend.legend_handles assert isinstance(result_handles[0], mpl.collections.LineCollection) assert isinstance(result_handles[1], mpl.lines.Line2D) @@ -48,10 +44,7 @@ def test_legend_false(self): ax = df.plot(legend=True, color={"a": "blue", "b": "green"}, secondary_y="b") df2.plot(legend=True, color={"d": "red"}, ax=ax) legend = ax.get_legend() - if Version(mpl.__version__) < Version("3.7"): - handles = legend.legendHandles - else: - handles = legend.legend_handles + handles = legend.legend_handles result = [handle.get_color() for handle in handles] expected = ["blue", "green", "red"] assert result == expected diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 43e1255404784..d3e1d7f60384b 100644 --- a/pandas/tests/plotting/test_misc.py +++ 
b/pandas/tests/plotting/test_misc.py @@ -31,6 +31,8 @@ plt = pytest.importorskip("matplotlib.pyplot") cm = pytest.importorskip("matplotlib.cm") +import re + from pandas.plotting._matplotlib.style import get_standard_colors @@ -681,3 +683,182 @@ def test_bar_plt_xaxis_intervalrange(self): (a.get_text() == b.get_text()) for a, b in zip(s.plot.bar().get_xticklabels(), expected) ) + + +@pytest.fixture +def df_bar_data(): + return np.random.default_rng(3).integers(0, 100, 5) + + +@pytest.fixture +def df_bar_df(df_bar_data) -> DataFrame: + df_bar_df = DataFrame( + { + "A": df_bar_data, + "B": df_bar_data[::-1], + "C": df_bar_data[0], + "D": df_bar_data[-1], + } + ) + return df_bar_df + + +def _df_bar_xyheight_from_ax_helper(df_bar_data, ax, subplot_division): + subplot_data_df_list = [] + + # get xy and height of squares representing data, separated by subplots + for i in range(len(subplot_division)): + subplot_data = np.array( + [ + (x.get_x(), x.get_y(), x.get_height()) + for x in ax[i].findobj(plt.Rectangle) + if x.get_height() in df_bar_data + ] + ) + subplot_data_df_list.append( + DataFrame(data=subplot_data, columns=["x_coord", "y_coord", "height"]) + ) + + return subplot_data_df_list + + +def _df_bar_subplot_checker(df_bar_data, df_bar_df, subplot_data_df, subplot_columns): + subplot_sliced_by_source = [ + subplot_data_df.iloc[ + len(df_bar_data) * i : len(df_bar_data) * (i + 1) + ].reset_index() + for i in range(len(subplot_columns)) + ] + + if len(subplot_columns) == 1: + expected_total_height = df_bar_df.loc[:, subplot_columns[0]] + else: + expected_total_height = df_bar_df.loc[:, subplot_columns].sum(axis=1) + + for i in range(len(subplot_columns)): + sliced_df = subplot_sliced_by_source[i] + if i == 0: + # Checks that the bar chart starts y=0 + assert (sliced_df["y_coord"] == 0).all() + height_iter = sliced_df["y_coord"].add(sliced_df["height"]) + else: + height_iter = height_iter + sliced_df["height"] + + if i + 1 == len(subplot_columns): + # Checks final height matches what is expected + tm.assert_series_equal( + height_iter, expected_total_height, check_names=False, check_dtype=False + ) + else: + # Checks each preceding bar ends where the next one starts + next_start_coord = subplot_sliced_by_source[i + 1]["y_coord"] + tm.assert_series_equal( + height_iter, next_start_coord, check_names=False, check_dtype=False + ) + + +# GH Issue 61018 +@pytest.mark.parametrize("columns_used", [["A", "B"], ["C", "D"], ["D", "A"]]) +def test_bar_1_subplot_1_double_stacked(df_bar_data, df_bar_df, columns_used): + df_bar_df_trimmed = df_bar_df[columns_used] + subplot_division = [columns_used] + ax = df_bar_df_trimmed.plot(subplots=subplot_division, kind="bar", stacked=True) + subplot_data_df_list = _df_bar_xyheight_from_ax_helper( + df_bar_data, ax, subplot_division + ) + for i in range(len(subplot_data_df_list)): + _df_bar_subplot_checker( + df_bar_data, df_bar_df_trimmed, subplot_data_df_list[i], subplot_division[i] + ) + + +@pytest.mark.parametrize( + "columns_used", [["A", "B", "C"], ["A", "C", "B"], ["D", "A", "C"]] +) +def test_bar_2_subplot_1_double_stacked(df_bar_data, df_bar_df, columns_used): + df_bar_df_trimmed = df_bar_df[columns_used] + subplot_division = [(columns_used[0], columns_used[1]), (columns_used[2],)] + ax = df_bar_df_trimmed.plot(subplots=subplot_division, kind="bar", stacked=True) + subplot_data_df_list = _df_bar_xyheight_from_ax_helper( + df_bar_data, ax, subplot_division + ) + for i in range(len(subplot_data_df_list)): + _df_bar_subplot_checker( + df_bar_data, 
df_bar_df_trimmed, subplot_data_df_list[i], subplot_division[i] + ) + + +@pytest.mark.parametrize( + "subplot_division", + [ + [("A", "B"), ("C", "D")], + [("A", "D"), ("C", "B")], + [("B", "C"), ("D", "A")], + [("B", "D"), ("C", "A")], + ], +) +def test_bar_2_subplot_2_double_stacked(df_bar_data, df_bar_df, subplot_division): + ax = df_bar_df.plot(subplots=subplot_division, kind="bar", stacked=True) + subplot_data_df_list = _df_bar_xyheight_from_ax_helper( + df_bar_data, ax, subplot_division + ) + for i in range(len(subplot_data_df_list)): + _df_bar_subplot_checker( + df_bar_data, df_bar_df, subplot_data_df_list[i], subplot_division[i] + ) + + +@pytest.mark.parametrize( + "subplot_division", + [[("A", "B", "C")], [("A", "D", "B")], [("C", "A", "D")], [("D", "C", "A")]], +) +def test_bar_2_subplots_1_triple_stacked(df_bar_data, df_bar_df, subplot_division): + ax = df_bar_df.plot(subplots=subplot_division, kind="bar", stacked=True) + subplot_data_df_list = _df_bar_xyheight_from_ax_helper( + df_bar_data, ax, subplot_division + ) + for i in range(len(subplot_data_df_list)): + _df_bar_subplot_checker( + df_bar_data, df_bar_df, subplot_data_df_list[i], subplot_division[i] + ) + + +def test_bar_subplots_stacking_bool(df_bar_data, df_bar_df): + subplot_division = [("A"), ("B"), ("C"), ("D")] + ax = df_bar_df.plot(subplots=True, kind="bar", stacked=True) + subplot_data_df_list = _df_bar_xyheight_from_ax_helper( + df_bar_data, ax, subplot_division + ) + for i in range(len(subplot_data_df_list)): + _df_bar_subplot_checker( + df_bar_data, df_bar_df, subplot_data_df_list[i], subplot_division[i] + ) + + +def test_plot_bar_label_count_default(): + df = DataFrame( + [(30, 10, 10, 10), (20, 20, 20, 20), (10, 30, 30, 10)], columns=list("ABCD") + ) + df.plot(subplots=True, kind="bar", title=["A", "B", "C", "D"]) + + +def test_plot_bar_label_count_expected_fail(): + df = DataFrame( + [(30, 10, 10, 10), (20, 20, 20, 20), (10, 30, 30, 10)], columns=list("ABCD") + ) + error_regex = re.escape( + "The number of titles (4) must equal the number of subplots (3)." + ) + with pytest.raises(ValueError, match=error_regex): + df.plot( + subplots=[("A", "B")], + kind="bar", + title=["A&B", "C", "D", "Extra Title"], + ) + + +def test_plot_bar_label_count_expected_success(): + df = DataFrame( + [(30, 10, 10, 10), (20, 20, 20, 20), (10, 30, 30, 10)], columns=list("ABCD") + ) + df.plot(subplots=[("A", "B", "D")], kind="bar", title=["A&B&D", "C"]) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index c3b0219971446..98e70f770896c 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -971,3 +971,27 @@ def test_secondary_y_subplot_axis_labels(self): s1.plot(ax=ax2) assert len(ax.xaxis.get_minor_ticks()) == 0 assert len(ax.get_xticklabels()) > 0 + + def test_bar_line_plot(self): + """ + Test that bar and line plots with the same x values are superposed + and that the x limits are set such that the plots are visible. 
+ """ + # GH61161 + index = period_range("2023", periods=3, freq="Y") + years = set(index.year.astype(str)) + s = Series([1, 2, 3], index=index) + ax = plt.subplot() + s.plot(kind="bar", ax=ax) + bar_xticks = [ + label for label in ax.get_xticklabels() if label.get_text() in years + ] + s.plot(kind="line", ax=ax, color="r") + line_xticks = [ + label for label in ax.get_xticklabels() if label.get_text() in years + ] + assert len(bar_xticks) == len(index) + assert bar_xticks == line_xticks + x_limits = ax.get_xlim() + assert x_limits[0] <= bar_xticks[0].get_position()[0] + assert x_limits[1] >= bar_xticks[-1].get_position()[0] diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index a7bb80727206e..485b50f65736e 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -780,6 +780,12 @@ def test_var_masked_array(self, ddof, exp): assert result == result_numpy_dtype assert result == exp + def test_var_complex_array(self): + # GH#61645 + ser = Series([-1j, 0j, 1j], dtype=complex) + assert ser.var(ddof=1) == 1.0 + assert ser.std(ddof=1) == 1.0 + @pytest.mark.parametrize("dtype", ("m8[ns]", "M8[ns]", "M8[ns, UTC]")) def test_empty_timeseries_reductions_return_nat(self, dtype, skipna): # covers GH#11245 diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index eb4ba6a3fdf71..d9bd89af61aaf 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -123,20 +123,20 @@ def test_resample_interpolate_regular_sampling_off_grid( ser = Series(np.arange(5.0), index) method = all_1d_no_arg_interpolation_methods - # Resample to 1 hour sampling and interpolate with the given method - ser_resampled = ser.resample("1h").interpolate(method) - - # Check that none of the resampled values are NaN, except the first one - # which lies 1 minute before the first actual data point - assert np.isnan(ser_resampled.iloc[0]) - assert not ser_resampled.iloc[1:].isna().any() - - if method not in ["nearest", "zero"]: - # Check that the resampled values are close to the expected values - # except for methods with known inaccuracies - assert np.all( - np.isclose(ser_resampled.values[1:], np.arange(0.5, 4.5, 0.5), rtol=1.0e-1) - ) + result = ser.resample("1h").interpolate(method) + + if method == "linear": + values = np.repeat(np.arange(0.0, 4.0), 2) + np.tile([1 / 3, 2 / 3], 4) + elif method == "nearest": + values = np.repeat(np.arange(0.0, 5.0), 2)[1:-1] + elif method == "zero": + values = np.repeat(np.arange(0.0, 4.0), 2) + else: + values = 0.491667 + np.arange(0.0, 4.0, 0.5) + values = np.insert(values, 0, np.nan) + index = date_range("2000-01-01 00:00:00", periods=9, freq="1h") + expected = Series(values, index=index) + tm.assert_series_equal(result, expected) def test_resample_interpolate_irregular_sampling(all_1d_no_arg_interpolation_methods): diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 3a7fd548ca961..f871c0bf0218c 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -2155,6 +2155,16 @@ def test_arrow_timestamp_resample(tz): tm.assert_series_equal(result, expected) +@td.skip_if_no("pyarrow") +def test_arrow_timestamp_resample_keep_index_name(): + # https://github.com/pandas-dev/pandas/issues/61222 + idx = Series(date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]") + expected = Series(np.arange(5, dtype=np.float64), 
index=idx) + expected.index.name = "index_name" + result = expected.resample("1D").mean() + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("freq", ["1A", "2A-MAR"]) def test_resample_A_raises(freq): msg = f"Invalid frequency: {freq[1:]}" diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 7870c5a9d3e17..286625b8ce470 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import is_platform_windows import pandas as pd @@ -462,7 +460,6 @@ def test_empty(keys): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("consolidate", [True, False]) def test_resample_groupby_agg_object_dtype_all_nan(consolidate): # https://github.com/pandas-dev/pandas/issues/39329 diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 3cc95922e7f2f..e6cfa12f5f61a 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -430,13 +430,7 @@ def test_groupby_resample_interpolate_with_apply_syntax_off_grid(groupy_test_df) ) expected = DataFrame( - data={ - "price": [ - 10.0, - 9.21131, - 11.0, - ] - }, + data={"price": [10.0, 9.5, 11.0]}, index=expected_ind, ) tm.assert_frame_equal(result, expected, check_names=False) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f0f67aebd85ec..f3418ad047afe 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -3060,3 +3060,12 @@ def test_merge_on_all_nan_column(): {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6], "zz": [4, 5, 6]} ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("suffixes", [("_dup", ""), ("", "_dup")]) +def test_merge_for_suffix_collisions(suffixes): + # GH#61402 + df1 = DataFrame({"col1": [1], "col2": [2]}) + df2 = DataFrame({"col1": [1], "col2": [2], "col2_dup": [3]}) + with pytest.raises(MergeError, match="duplicate columns"): + merge(df1, df2, on="col1", suffixes=suffixes) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 95aa5291cb45a..02544c9518d10 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -555,6 +555,14 @@ def test_melt_multiindex_columns_var_name_too_many(self): ): df.melt(var_name=["first", "second", "third"]) + def test_melt_duplicate_column_header_raises(self): + # GH61475 + df = DataFrame([[1, 2, 3], [3, 4, 5]], columns=["A", "A", "B"]) + msg = "id_vars cannot contain duplicate columns." 
+ + with pytest.raises(ValueError, match=msg): + df.melt(id_vars=["A"], value_vars=["B"]) + class TestLreshape: def test_pairs(self): diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 374d236c8ff39..2a58815c1cece 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -15,6 +15,7 @@ import pandas as pd from pandas import ( + ArrowDtype, Categorical, DataFrame, Grouper, @@ -2529,6 +2530,70 @@ def test_pivot_table_aggfunc_nunique_with_different_values(self): tm.assert_frame_equal(result, expected) + def test_pivot_table_index_and_column_keys_with_nan(self, dropna): + # GH#61113 + data = {"row": [None, *range(4)], "col": [*range(4), None], "val": range(5)} + df = DataFrame(data) + result = df.pivot_table(values="val", index="row", columns="col", dropna=dropna) + e_axis = [*range(4), None] + nan = np.nan + e_data = [ + [nan, 1.0, nan, nan, nan], + [nan, nan, 2.0, nan, nan], + [nan, nan, nan, 3.0, nan], + [nan, nan, nan, nan, 4.0], + [0.0, nan, nan, nan, nan], + ] + expected = DataFrame( + data=e_data, + index=Index(data=e_axis, name="row"), + columns=Index(data=e_axis, name="col"), + ) + if dropna: + expected = expected.loc[[0, 1, 2], [1, 2, 3]] + + tm.assert_frame_equal(left=result, right=expected) + + @pytest.mark.parametrize( + "index, columns, e_data, e_index, e_cols", + [ + ( + "Category", + "Value", + [ + [1.0, np.nan, 1.0, np.nan], + [np.nan, 1.0, np.nan, 1.0], + ], + Index(data=["A", "B"], name="Category"), + Index(data=[10, 20, 40, 50], name="Value"), + ), + ( + "Value", + "Category", + [ + [1.0, np.nan], + [np.nan, 1.0], + [1.0, np.nan], + [np.nan, 1.0], + ], + Index(data=[10, 20, 40, 50], name="Value"), + Index(data=["A", "B"], name="Category"), + ), + ], + ids=["values-and-columns", "values-and-index"], + ) + def test_pivot_table_values_as_two_params( + self, index, columns, e_data, e_index, e_cols + ): + # GH#57876 + data = {"Category": ["A", "B", "A", "B"], "Value": [10, 20, 40, 50]} + df = DataFrame(data) + result = df.pivot_table( + index=index, columns=columns, values="Value", aggfunc="count" + ) + expected = DataFrame(data=e_data, index=e_index, columns=e_cols) + tm.assert_frame_equal(result, expected) + class TestPivot: def test_pivot(self): @@ -2827,3 +2892,31 @@ def test_pivot_margins_with_none_index(self): ), ) tm.assert_frame_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") + def test_pivot_with_pyarrow_categorical(self): + # GH#53051 + pa = pytest.importorskip("pyarrow") + + df = DataFrame( + {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]} + ).astype( + { + "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())), + "number_column": "float[pyarrow]", + } + ) + + df = df.pivot(columns=["string_column"], values=["number_column"]) + + multi_index = MultiIndex.from_arrays( + [["number_column", "number_column", "number_column"], ["A", "B", "C"]], + names=(None, "string_column"), + ) + df_expected = DataFrame( + [[1.0, np.nan, np.nan], [np.nan, 2.0, np.nan], [np.nan, np.nan, 3.0]], + columns=multi_index, + ) + tm.assert_frame_equal( + df, df_expected, check_dtype=False, check_column_type=False + ) diff --git a/pandas/tests/reshape/test_pivot_multilevel.py b/pandas/tests/reshape/test_pivot_multilevel.py index 2c9d54c3db72c..af70210b37f3c 100644 --- a/pandas/tests/reshape/test_pivot_multilevel.py +++ b/pandas/tests/reshape/test_pivot_multilevel.py @@ -250,3 +250,52 @@ def test_pivot_df_multiindex_index_none(): 
columns=Index(["label1", "label2"], name="label"), ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "index, columns, e_data, e_index, e_cols", + [ + ( + "index", + ["col", "value"], + [ + [50.0, np.nan, 100.0, np.nan], + [np.nan, 100.0, np.nan, 200.0], + ], + Index(data=["A", "B"], name="index"), + MultiIndex.from_arrays( + arrays=[[1, 1, 2, 2], [50, 100, 100, 200]], names=["col", "value"] + ), + ), + ( + ["index", "value"], + "col", + [ + [50.0, np.nan], + [np.nan, 100.0], + [100.0, np.nan], + [np.nan, 200.0], + ], + MultiIndex.from_arrays( + arrays=[["A", "A", "B", "B"], [50, 100, 100, 200]], + names=["index", "value"], + ), + Index(data=[1, 2], name="col"), + ), + ], + ids=["values-and-columns", "values-and-index"], +) +def test_pivot_table_multiindex_values_as_two_params( + index, columns, e_data, e_index, e_cols +): + # GH#61292 + data = [ + ["A", 1, 50, -1], + ["B", 1, 100, -2], + ["A", 2, 100, -2], + ["B", 2, 200, -4], + ] + df = pd.DataFrame(data=data, columns=["index", "col", "value", "extra"]) + result = df.pivot_table(values="value", index=index, columns=columns) + expected = pd.DataFrame(data=e_data, index=e_index, columns=e_cols) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/accessors/test_cat_accessor.py b/pandas/tests/series/accessors/test_cat_accessor.py index ce8ea27ea1fa2..f017ccd963972 100644 --- a/pandas/tests/series/accessors/test_cat_accessor.py +++ b/pandas/tests/series/accessors/test_cat_accessor.py @@ -40,7 +40,7 @@ def test_getname_categorical_accessor(self, method): def test_cat_accessor(self): ser = Series(Categorical(["a", "b", np.nan, "a"])) tm.assert_index_equal(ser.cat.categories, Index(["a", "b"])) - assert not ser.cat.ordered, False + assert not ser.cat.ordered exp = Categorical(["a", "b", np.nan, "a"], categories=["b", "a"]) diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index ff7f8d0b7fa72..f8ceb67b34af2 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -270,7 +270,7 @@ def test_nan_interpolate(self, kwargs): def test_nan_irregular_index(self): s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9]) result = s.interpolate() - expected = Series([1.0, 2.0, 2.6666666666666665, 4.0], index=[1, 3, 5, 9]) + expected = Series([1.0, 2.0, 3.0, 4.0], index=[1, 3, 5, 9]) tm.assert_series_equal(result, expected) def test_nan_str_index(self): diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 84b60a2afe6eb..0ec973dea23d5 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -21,6 +21,10 @@ ) import pandas._testing as tm +# The fixture it's mostly used in pandas/tests/apply, so it's defined in that +# conftest, which is out of scope here. 
So we need to manually import +from pandas.tests.apply.conftest import engine # noqa: F401 + def test_series_map_box_timedelta(): # GH#11349 @@ -32,16 +36,20 @@ def f(x): ser.map(f) -def test_map_callable(datetime_series): +def test_map_callable(datetime_series, engine): # noqa: F811 with np.errstate(all="ignore"): - tm.assert_series_equal(datetime_series.map(np.sqrt), np.sqrt(datetime_series)) + tm.assert_series_equal( + datetime_series.map(np.sqrt, engine=engine), np.sqrt(datetime_series) + ) # map function element-wise - tm.assert_series_equal(datetime_series.map(math.exp), np.exp(datetime_series)) + tm.assert_series_equal( + datetime_series.map(math.exp, engine=engine), np.exp(datetime_series) + ) # empty series s = Series(dtype=object, name="foo", index=Index([], name="bar")) - rs = s.map(lambda x: x) + rs = s.map(lambda x: x, engine=engine) tm.assert_series_equal(s, rs) # check all metadata (GH 9322) @@ -52,7 +60,7 @@ def test_map_callable(datetime_series): # index but no data s = Series(index=[1, 2, 3], dtype=np.float64) - rs = s.map(lambda x: x) + rs = s.map(lambda x: x, engine=engine) tm.assert_series_equal(s, rs) @@ -269,10 +277,10 @@ def test_map_decimal(string_series): assert isinstance(result.iloc[0], Decimal) -def test_map_na_exclusion(): +def test_map_na_exclusion(engine): # noqa: F811 s = Series([1.5, np.nan, 3, np.nan, 5]) - result = s.map(lambda x: x * 2, na_action="ignore") + result = s.map(lambda x: x * 2, na_action="ignore", engine=engine) exp = s * 2 tm.assert_series_equal(result, exp) @@ -604,3 +612,42 @@ def test_map_kwargs(): result = Series([2, 4, 5]).map(lambda x, y: x + y, y=2) expected = Series([4, 6, 7]) tm.assert_series_equal(result, expected) + + +def test_map_arg_as_kwarg(): + with tm.assert_produces_warning( + FutureWarning, match="`arg` has been renamed to `func`" + ): + Series([1, 2]).map(arg={}) + + +def test_map_func_and_arg(): + # `arg`is considered a normal kwarg that should be passed to the function + result = Series([1, 2]).map(lambda _, arg: arg, arg=3) + expected = Series([3, 3]) + tm.assert_series_equal(result, expected) + + +def test_map_no_func_or_arg(): + with pytest.raises(ValueError, match="The `func` parameter is required"): + Series([1, 2]).map() + + +def test_map_func_is_none(): + with pytest.raises(ValueError, match="The `func` parameter is required"): + Series([1, 2]).map(func=None) + + +@pytest.mark.parametrize("func", [{}, {1: 2}, Series([3, 4])]) +def test_map_engine_no_function(func): + s = Series([1, 2]) + + with pytest.raises(ValueError, match="engine argument can only be specified"): + s.map(func, engine="something") + + +def test_map_engine_not_executor(): + s = Series([1, 2]) + + with pytest.raises(ValueError, match="Not a valid engine: 'something'"): + s.map(lambda x: x, engine="something") diff --git a/pandas/tests/series/methods/test_round.py b/pandas/tests/series/methods/test_round.py index c330b7a7dfbbb..a78f77e990ae1 100644 --- a/pandas/tests/series/methods/test_round.py +++ b/pandas/tests/series/methods/test_round.py @@ -72,3 +72,10 @@ def test_round_ea_boolean(self): tm.assert_series_equal(result, expected) result.iloc[0] = False tm.assert_series_equal(ser, expected) + + def test_round_dtype_object(self): + # GH#61206 + ser = Series([0.2], dtype="object") + msg = "Expected numeric dtype, got object instead." 
+ with pytest.raises(TypeError, match=msg): + ser.round() diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 5f4a100e7ccc7..f82451a2be84d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -632,7 +632,7 @@ def test_constructor_maskedarray_hardened(self): def test_series_ctor_plus_datetimeindex(self): rng = date_range("20090415", "20090519", freq="B") - data = {k: 1 for k in rng} + data = dict.fromkeys(rng, 1) result = Series(data, index=rng) assert result.index.is_(rng) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 76fad35304fe6..d7398ffe259cb 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -4,6 +4,7 @@ import array from functools import partial +import importlib import subprocess import sys @@ -102,7 +103,7 @@ def test_xarray_cftimeindex_nearest(): cftime = pytest.importorskip("cftime") xarray = pytest.importorskip("xarray") - times = xarray.cftime_range("0001", periods=2) + times = xarray.date_range("0001", periods=2, use_cftime=True) key = cftime.DatetimeGregorian(2000, 1, 1) result = times.get_indexer([key], method="nearest") expected = 1 @@ -186,41 +187,21 @@ def test_yaml_dump(df): tm.assert_frame_equal(df, loaded2) -@pytest.mark.single_cpu -def test_missing_required_dependency(): - # GH 23868 - # To ensure proper isolation, we pass these flags - # -S : disable site-packages - # -s : disable user site-packages - # -E : disable PYTHON* env vars, especially PYTHONPATH - # https://github.com/MacPython/pandas-wheels/pull/50 - - pyexe = sys.executable.replace("\\", "/") - - # We skip this test if pandas is installed as a site package. We first - # import the package normally and check the path to the module before - # executing the test which imports pandas with site packages disabled. - call = [pyexe, "-c", "import pandas;print(pandas.__file__)"] - output = subprocess.check_output(call).decode() - if "site-packages" in output: - pytest.skip("pandas installed as site package") - - # This test will fail if pandas is installed as a site package. The flags - # prevent pandas being imported and the test will report Failed: DID NOT - # RAISE - call = [pyexe, "-sSE", "-c", "import pandas"] - - msg = ( - rf"Command '\['{pyexe}', '-sSE', '-c', 'import pandas'\]' " - "returned non-zero exit status 1." 
- ) +@pytest.mark.parametrize("dependency", ["numpy", "dateutil", "tzdata"]) +def test_missing_required_dependency(monkeypatch, dependency): + # GH#61030, GH61273 + original_import = __import__ + mock_error = ImportError(f"Mock error for {dependency}") + + def mock_import(name, *args, **kwargs): + if name == dependency: + raise mock_error + return original_import(name, *args, **kwargs) - with pytest.raises(subprocess.CalledProcessError, match=msg) as exc: - subprocess.check_output(call, stderr=subprocess.STDOUT) + monkeypatch.setattr("builtins.__import__", mock_import) - output = exc.value.stdout.decode() - for name in ["numpy", "dateutil"]: - assert name in output + with pytest.raises(ImportError, match=dependency): + importlib.reload(importlib.import_module("pandas")) def test_frame_setitem_dask_array_into_new_col(request): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index a23e6d9b3973a..ff7ab22c197d8 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -5,6 +5,7 @@ import pandas as pd from pandas import ( + ArrowDtype, DataFrame, MultiIndex, Series, @@ -318,6 +319,34 @@ def test_multiindex_dt_with_nan(self): expected = Series(["a", "b", "c", "d"], name=("sub", np.nan)) tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") + def test_multiindex_with_pyarrow_categorical(self): + # GH#53051 + pa = pytest.importorskip("pyarrow") + + df = DataFrame( + {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]} + ).astype( + { + "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())), + "number_column": "float[pyarrow]", + } + ) + + df = df.set_index(["string_column", "number_column"]) + + df_expected = DataFrame( + index=MultiIndex.from_arrays( + [["A", "B", "C"], [1, 2, 3]], names=["string_column", "number_column"] + ) + ) + tm.assert_frame_equal( + df, + df_expected, + check_index_type=False, + check_column_type=False, + ) + class TestSorted: """everything you wanted to test about sorting""" diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 616ae36c989be..b02fab70fb825 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3514,6 +3514,54 @@ def test_to_datetime_mixed_not_necessarily_iso8601_coerce(): tm.assert_index_equal(result, DatetimeIndex(["2020-01-01 00:00:00", NaT])) +def test_to_datetime_iso8601_utc_single_naive(): + # GH#61389 + result = to_datetime("2023-10-15T14:30:00", utc=True, format="ISO8601") + expected = Timestamp("2023-10-15 14:30:00+00:00") + assert result == expected + + +def test_to_datetime_iso8601_utc_mixed_negative_offset(): + # GH#61389 + data = ["2023-10-15T10:30:00-12:00", "2023-10-15T14:30:00"] + result = to_datetime(data, utc=True, format="ISO8601") + + expected = DatetimeIndex( + [Timestamp("2023-10-15 22:30:00+00:00"), Timestamp("2023-10-15 14:30:00+00:00")] + ) + tm.assert_index_equal(result, expected) + + +def test_to_datetime_iso8601_utc_mixed_positive_offset(): + # GH#61389 + data = ["2023-10-15T10:30:00+08:00", "2023-10-15T14:30:00"] + result = to_datetime(data, utc=True, format="ISO8601") + + expected = DatetimeIndex( + [Timestamp("2023-10-15 02:30:00+00:00"), Timestamp("2023-10-15 14:30:00+00:00")] + ) + tm.assert_index_equal(result, expected) + + +def test_to_datetime_iso8601_utc_mixed_both_offsets(): + # GH#61389 + data = [ + "2023-10-15T10:30:00+08:00", + "2023-10-15T12:30:00-05:00", + 
"2023-10-15T14:30:00", + ] + result = to_datetime(data, utc=True, format="ISO8601") + + expected = DatetimeIndex( + [ + Timestamp("2023-10-15 02:30:00+00:00"), + Timestamp("2023-10-15 17:30:00+00:00"), + Timestamp("2023-10-15 14:30:00+00:00"), + ] + ) + tm.assert_index_equal(result, expected) + + def test_unknown_tz_raises(): # GH#18702, GH#51476 dtstr = "2014 Jan 9 05:15 FAKE" diff --git a/pandas/tests/tseries/holiday/test_holiday.py b/pandas/tests/tseries/holiday/test_holiday.py index ffe6ff0b51bcf..054fc07e4180f 100644 --- a/pandas/tests/tseries/holiday/test_holiday.py +++ b/pandas/tests/tseries/holiday/test_holiday.py @@ -353,3 +353,111 @@ def test_holidays_with_timezone_specified_but_no_occurences(): expected_results.index = expected_results.index.as_unit("ns") tm.assert_equal(test_case, expected_results) + + +def test_holiday_with_exclusion(): + # GH 54382 + start = Timestamp("2020-05-01") + end = Timestamp("2025-05-31") + exclude = DatetimeIndex([Timestamp("2022-05-30")]) # Queen's platinum Jubilee + + queens_jubilee_uk_spring_bank_holiday: Holiday = Holiday( + "Queen's Jubilee UK Spring Bank Holiday", + month=5, + day=31, + offset=DateOffset(weekday=MO(-1)), + exclude_dates=exclude, + ) + + result = queens_jubilee_uk_spring_bank_holiday.dates(start, end) + expected = DatetimeIndex( + [ + Timestamp("2020-05-25"), + Timestamp("2021-05-31"), + Timestamp("2023-05-29"), + Timestamp("2024-05-27"), + Timestamp("2025-05-26"), + ], + dtype="datetime64[ns]", + ) + tm.assert_index_equal(result, expected) + + +def test_holiday_with_multiple_exclusions(): + start = Timestamp("2025-01-01") + end = Timestamp("2065-12-31") + exclude = DatetimeIndex( + [ + Timestamp("2025-01-01"), + Timestamp("2042-01-01"), + Timestamp("2061-01-01"), + ] + ) # Yakudoshi new year + + yakudoshi_new_year: Holiday = Holiday( + "Yakudoshi New Year", month=1, day=1, exclude_dates=exclude + ) + + result = yakudoshi_new_year.dates(start, end) + expected = DatetimeIndex( + [ + Timestamp("2026-01-01"), + Timestamp("2027-01-01"), + Timestamp("2028-01-01"), + Timestamp("2029-01-01"), + Timestamp("2030-01-01"), + Timestamp("2031-01-01"), + Timestamp("2032-01-01"), + Timestamp("2033-01-01"), + Timestamp("2034-01-01"), + Timestamp("2035-01-01"), + Timestamp("2036-01-01"), + Timestamp("2037-01-01"), + Timestamp("2038-01-01"), + Timestamp("2039-01-01"), + Timestamp("2040-01-01"), + Timestamp("2041-01-01"), + Timestamp("2043-01-01"), + Timestamp("2044-01-01"), + Timestamp("2045-01-01"), + Timestamp("2046-01-01"), + Timestamp("2047-01-01"), + Timestamp("2048-01-01"), + Timestamp("2049-01-01"), + Timestamp("2050-01-01"), + Timestamp("2051-01-01"), + Timestamp("2052-01-01"), + Timestamp("2053-01-01"), + Timestamp("2054-01-01"), + Timestamp("2055-01-01"), + Timestamp("2056-01-01"), + Timestamp("2057-01-01"), + Timestamp("2058-01-01"), + Timestamp("2059-01-01"), + Timestamp("2060-01-01"), + Timestamp("2062-01-01"), + Timestamp("2063-01-01"), + Timestamp("2064-01-01"), + Timestamp("2065-01-01"), + ], + dtype="datetime64[ns]", + ) + tm.assert_index_equal(result, expected) + + +def test_exclude_date_value_error(): + msg = "exclude_dates must be None or of type DatetimeIndex." + + with pytest.raises(ValueError, match=msg): + exclude = [ + Timestamp("2025-06-10"), + Timestamp("2026-06-10"), + ] + Holiday("National Ice Tea Day", month=6, day=10, exclude_dates=exclude) + + +def test_days_of_week_value_error(): + msg = "days_of_week must be None or tuple." 
+ + with pytest.raises(ValueError, match=msg): + Holiday("World Blood Donor Day", month=6, day=14, days_of_week=[0, 1]) diff --git a/pandas/tests/tseries/offsets/test_easter.py b/pandas/tests/tseries/offsets/test_easter.py index ada72d94434a3..309411ceb5be2 100644 --- a/pandas/tests/tseries/offsets/test_easter.py +++ b/pandas/tests/tseries/offsets/test_easter.py @@ -7,6 +7,10 @@ from datetime import datetime +from dateutil.easter import ( + EASTER_ORTHODOX, + EASTER_WESTERN, +) import pytest from pandas.tests.tseries.offsets.common import assert_offset_equal @@ -32,3 +36,115 @@ class TestEaster: ) def test_offset(self, offset, date, expected): assert_offset_equal(offset, date, expected) + + @pytest.mark.parametrize( + "offset,date,expected", + [ + (Easter(method=EASTER_WESTERN), datetime(2010, 1, 1), datetime(2010, 4, 4)), + ( + Easter(method=EASTER_WESTERN), + datetime(2010, 4, 5), + datetime(2011, 4, 24), + ), + ( + Easter(2, method=EASTER_WESTERN), + datetime(2010, 1, 1), + datetime(2011, 4, 24), + ), + ( + Easter(method=EASTER_WESTERN), + datetime(2010, 4, 4), + datetime(2011, 4, 24), + ), + ( + Easter(2, method=EASTER_WESTERN), + datetime(2010, 4, 4), + datetime(2012, 4, 8), + ), + ( + -Easter(method=EASTER_WESTERN), + datetime(2011, 1, 1), + datetime(2010, 4, 4), + ), + ( + -Easter(method=EASTER_WESTERN), + datetime(2010, 4, 5), + datetime(2010, 4, 4), + ), + ( + -Easter(2, method=EASTER_WESTERN), + datetime(2011, 1, 1), + datetime(2009, 4, 12), + ), + ( + -Easter(method=EASTER_WESTERN), + datetime(2010, 4, 4), + datetime(2009, 4, 12), + ), + ( + -Easter(2, method=EASTER_WESTERN), + datetime(2010, 4, 4), + datetime(2008, 3, 23), + ), + ], + ) + def test_western_easter_offset(self, offset, date, expected): + assert_offset_equal(offset, date, expected) + + @pytest.mark.parametrize( + "offset,date,expected", + [ + ( + Easter(method=EASTER_ORTHODOX), + datetime(2010, 1, 1), + datetime(2010, 4, 4), + ), + ( + Easter(method=EASTER_ORTHODOX), + datetime(2010, 4, 5), + datetime(2011, 4, 24), + ), + ( + Easter(2, method=EASTER_ORTHODOX), + datetime(2010, 1, 1), + datetime(2011, 4, 24), + ), + ( + Easter(method=EASTER_ORTHODOX), + datetime(2010, 4, 4), + datetime(2011, 4, 24), + ), + ( + Easter(2, method=EASTER_ORTHODOX), + datetime(2010, 4, 4), + datetime(2012, 4, 15), + ), + ( + -Easter(method=EASTER_ORTHODOX), + datetime(2011, 1, 1), + datetime(2010, 4, 4), + ), + ( + -Easter(method=EASTER_ORTHODOX), + datetime(2010, 4, 5), + datetime(2010, 4, 4), + ), + ( + -Easter(2, method=EASTER_ORTHODOX), + datetime(2011, 1, 1), + datetime(2009, 4, 19), + ), + ( + -Easter(method=EASTER_ORTHODOX), + datetime(2010, 4, 4), + datetime(2009, 4, 19), + ), + ( + -Easter(2, method=EASTER_ORTHODOX), + datetime(2010, 4, 4), + datetime(2008, 4, 27), + ), + ], + ) + def test_orthodox_easter_offset(self, offset, date, expected): + assert_offset_equal(offset, date, expected) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index f5c2c06162fcb..0b2e66a2b3a0d 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -239,7 +239,7 @@ def test_offset_freqstr(self, offset_types): offset = _create_offset(offset_types) freqstr = offset.freqstr - if freqstr not in ("", "", "LWOM-SAT"): + if freqstr not in ("", "", "LWOM-SAT"): code = _get_offset(freqstr) assert offset.rule_code == code diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index 67521c7e2a3ac..add9213ae59fb 100644 
--- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -207,8 +207,10 @@ def test_to_offset_lowercase_frequency_raises(freq_depr): @pytest.mark.parametrize("freq_depr", ["2MIN", "2Us", "2NS"]) def test_to_offset_uppercase_frequency_deprecated(freq_depr): # GH#54939 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " - f"future version, please use '{freq_depr.lower()[1:]}' instead." + depr_msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.lower()[1:]}' instead." + ) with tm.assert_produces_warning(FutureWarning, match=depr_msg): to_offset(freq_depr) diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 887aeca6590dc..ff6a616bc5264 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -12,6 +12,7 @@ to_datetime, ) import pandas._testing as tm +from pandas.api.indexers import BaseIndexer from pandas.util.version import Version pytestmark = [pytest.mark.single_cpu] @@ -581,3 +582,67 @@ def test_npfunc_no_warnings(): df = DataFrame({"col1": [1, 2, 3, 4, 5]}) with tm.assert_produces_warning(False): df.col1.rolling(2).apply(np.prod, raw=True, engine="numba") + + +class PrescribedWindowIndexer(BaseIndexer): + def __init__(self, start, end): + self._start = start + self._end = end + super().__init__() + + def get_window_bounds( + self, num_values=None, min_periods=None, center=None, closed=None, step=None + ): + if num_values is None: + num_values = len(self._start) + start = np.clip(self._start, 0, num_values) + end = np.clip(self._end, 0, num_values) + return start, end + + +@td.skip_if_no("numba") +class TestMinMaxNumba: + @pytest.mark.parametrize( + "is_max, has_nan, exp_list", + [ + (True, False, [3.0, 5.0, 2.0, 5.0, 1.0, 5.0, 6.0, 7.0, 8.0, 9.0]), + (True, True, [3.0, 4.0, 2.0, 4.0, 1.0, 4.0, 6.0, 7.0, 7.0, 9.0]), + (False, False, [3.0, 2.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 7.0, 0.0]), + (False, True, [3.0, 2.0, 2.0, 1.0, 1.0, 1.0, 6.0, 6.0, 7.0, 1.0]), + ], + ) + def test_minmax(self, is_max, has_nan, exp_list): + nan_idx = [0, 5, 8] + df = DataFrame( + { + "data": [5.0, 4.0, 3.0, 2.0, 1.0, 0.0, 6.0, 7.0, 8.0, 9.0], + "start": [2, 0, 3, 0, 4, 0, 5, 5, 7, 3], + "end": [3, 4, 4, 5, 5, 6, 7, 8, 9, 10], + } + ) + if has_nan: + df.loc[nan_idx, "data"] = np.nan + expected = Series(exp_list, name="data") + r = df.data.rolling( + PrescribedWindowIndexer(df.start.to_numpy(), df.end.to_numpy()) + ) + if is_max: + result = r.max(engine="numba") + else: + result = r.min(engine="numba") + + tm.assert_series_equal(result, expected) + + def test_wrong_order(self): + start = np.array(range(5), dtype=np.int64) + end = start + 1 + end[3] = end[2] + start[3] = start[2] - 1 + + df = DataFrame({"data": start * 1.0, "start": start, "end": end}) + + r = df.data.rolling(PrescribedWindowIndexer(start, end)) + with pytest.raises( + ValueError, match="Start/End ordering requirement is violated at index 3" + ): + r.max(engine="numba") diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 8c57781c1447c..6e8f075d35490 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1946,3 +1946,66 @@ def test_rolling_timedelta_window_non_nanoseconds(unit, tz): df.index = df.index.as_unit("ns") tm.assert_frame_equal(ref_df, df) + + +class PrescribedWindowIndexer(BaseIndexer): + def __init__(self, start, end): + self._start = start + self._end = end + 
super().__init__() + + def get_window_bounds( + self, num_values=None, min_periods=None, center=None, closed=None, step=None + ): + if num_values is None: + num_values = len(self._start) + start = np.clip(self._start, 0, num_values) + end = np.clip(self._end, 0, num_values) + return start, end + + +class TestMinMax: + @pytest.mark.parametrize( + "is_max, has_nan, exp_list", + [ + (True, False, [3.0, 5.0, 2.0, 5.0, 1.0, 5.0, 6.0, 7.0, 8.0, 9.0]), + (True, True, [3.0, 4.0, 2.0, 4.0, 1.0, 4.0, 6.0, 7.0, 7.0, 9.0]), + (False, False, [3.0, 2.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 7.0, 0.0]), + (False, True, [3.0, 2.0, 2.0, 1.0, 1.0, 1.0, 6.0, 6.0, 7.0, 1.0]), + ], + ) + def test_minmax(self, is_max, has_nan, exp_list): + nan_idx = [0, 5, 8] + df = DataFrame( + { + "data": [5.0, 4.0, 3.0, 2.0, 1.0, 0.0, 6.0, 7.0, 8.0, 9.0], + "start": [2, 0, 3, 0, 4, 0, 5, 5, 7, 3], + "end": [3, 4, 4, 5, 5, 6, 7, 8, 9, 10], + } + ) + if has_nan: + df.loc[nan_idx, "data"] = np.nan + expected = Series(exp_list, name="data") + r = df.data.rolling( + PrescribedWindowIndexer(df.start.to_numpy(), df.end.to_numpy()) + ) + if is_max: + result = r.max() + else: + result = r.min() + + tm.assert_series_equal(result, expected) + + def test_wrong_order(self): + start = np.array(range(5), dtype=np.int64) + end = start + 1 + end[3] = end[2] + start[3] = start[2] - 1 + + df = DataFrame({"data": start * 1.0, "start": start, "end": end}) + + r = df.data.rolling(PrescribedWindowIndexer(start, end)) + with pytest.raises( + ValueError, match="Start/End ordering requirement is violated at index 3" + ): + r.max() diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 2d195fbbc4e84..e61fd6594ea84 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -169,6 +169,7 @@ def __init__( start_date=None, end_date=None, days_of_week: tuple | None = None, + exclude_dates: DatetimeIndex | None = None, ) -> None: """ Parameters @@ -191,8 +192,11 @@ class from pandas.tseries.offsets, default None end_date : datetime-like, default None Last date the holiday is observed days_of_week : tuple of int or dateutil.relativedelta weekday strs, default None - Provide a tuple of days e.g (0,1,2,3,) for Monday Through Thursday + Provide a tuple of days e.g (0,1,2,3,) for Monday through Thursday Monday=0,..,Sunday=6 + Only instances of the holiday included in days_of_week will be computed + exclude_dates : DatetimeIndex or default None + Specific dates to exclude e.g. 
skipping a specific year's holiday Examples -------- @@ -255,8 +259,12 @@ class from pandas.tseries.offsets, default None ) self.end_date = Timestamp(end_date) if end_date is not None else end_date self.observance = observance - assert days_of_week is None or type(days_of_week) == tuple + if not (days_of_week is None or isinstance(days_of_week, tuple)): + raise ValueError("days_of_week must be None or tuple.") self.days_of_week = days_of_week + if not (exclude_dates is None or isinstance(exclude_dates, DatetimeIndex)): + raise ValueError("exclude_dates must be None or of type DatetimeIndex.") + self.exclude_dates = exclude_dates def __repr__(self) -> str: info = "" @@ -328,6 +336,9 @@ def dates( holiday_dates = holiday_dates[ (holiday_dates >= filter_start_date) & (holiday_dates <= filter_end_date) ] + + if self.exclude_dates is not None: + holiday_dates = holiday_dates.difference(self.exclude_dates) if return_name: return Series(self.name, index=holiday_dates) return holiday_dates diff --git a/pyproject.toml b/pyproject.toml index b7d53b0d8934a..b17a1eacfa717 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "meson-python>=0.13.1", "meson>=1.2.1,<2", "wheel", - "Cython~=3.0.5", # Note: sync with setup.py, environment.yml and asv.conf.json + "Cython<4.0.0a0", # Note: sync with setup.py, environment.yml and asv.conf.json # Force numpy higher than 2.0rc1, so that built wheels are compatible # with both numpy 1 and 2 "numpy>=2.0.0rc1", @@ -60,66 +60,64 @@ matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] test = ['hypothesis>=6.84.0', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0'] pyarrow = ['pyarrow>=10.0.1'] -performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4'] -computation = ['scipy>=1.10.0', 'xarray>=2022.12.0'] -fss = ['fsspec>=2022.11.0'] -aws = ['s3fs>=2022.11.0'] -gcp = ['gcsfs>=2022.11.0'] -excel = ['odfpy>=1.4.1', 'openpyxl>=3.1.0', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.10', 'xlrd>=2.0.1', 'xlsxwriter>=3.0.5'] +performance = ['bottleneck>=1.3.6', 'numba>=0.59.0', 'numexpr>=2.9.0'] +computation = ['scipy>=1.12.0', 'xarray>=2024.1.1'] +fss = ['fsspec>=2023.12.2'] +aws = ['s3fs>=2023.12.2'] +gcp = ['gcsfs>=2023.12.2'] +excel = ['odfpy>=1.4.1', 'openpyxl>=3.1.2', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.10', 'xlrd>=2.0.1', 'xlsxwriter>=3.2.0'] parquet = ['pyarrow>=10.0.1'] feather = ['pyarrow>=10.0.1'] -hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) - #'blosc>=1.20.1', - 'tables>=3.8.0'] -spss = ['pyreadstat>=1.2.0'] +iceberg = ['pyiceberg>=0.7.1'] +hdf5 = ['tables>=3.8.0'] +spss = ['pyreadstat>=1.2.6'] postgresql = ['SQLAlchemy>=2.0.0', 'psycopg2>=2.9.6', 'adbc-driver-postgresql>=0.10.0'] -mysql = ['SQLAlchemy>=2.0.0', 'pymysql>=1.0.2'] +mysql = ['SQLAlchemy>=2.0.0', 'pymysql>=1.1.0'] sql-other = ['SQLAlchemy>=2.0.0', 'adbc-driver-postgresql>=0.10.0', 'adbc-driver-sqlite>=0.8.0'] -html = ['beautifulsoup4>=4.11.2', 'html5lib>=1.1', 'lxml>=4.9.2'] +html = ['beautifulsoup4>=4.12.3', 'html5lib>=1.1', 'lxml>=4.9.2'] xml = ['lxml>=4.9.2'] -plot = ['matplotlib>=3.6.3'] -output-formatting = ['jinja2>=3.1.2', 'tabulate>=0.9.0'] +plot = ['matplotlib>=3.8.3'] +output-formatting = ['jinja2>=3.1.3', 'tabulate>=0.9.0'] clipboard = ['PyQt5>=5.15.9', 'qtpy>=2.3.0'] -compression = ['zstandard>=0.19.0'] +compression = ['zstandard>=0.22.0'] timezone = ['pytz>=2023.4'] all = ['adbc-driver-postgresql>=0.10.0', 'adbc-driver-sqlite>=0.8.0', - 'beautifulsoup4>=4.11.2', - # blosc only available on conda 
(https://github.com/Blosc/python-blosc/issues/297) - #'blosc>=1.21.3', + 'beautifulsoup4>=4.12.3', 'bottleneck>=1.3.6', - 'fastparquet>=2023.10.0', - 'fsspec>=2022.11.0', - 'gcsfs>=2022.11.0', + 'fastparquet>=2024.2.0', + 'fsspec>=2023.12.2', + 'gcsfs>=2023.12.2', 'html5lib>=1.1', 'hypothesis>=6.84.0', - 'jinja2>=3.1.2', + 'jinja2>=3.1.3', 'lxml>=4.9.2', - 'matplotlib>=3.6.3', - 'numba>=0.56.4', - 'numexpr>=2.8.4', + 'matplotlib>=3.8.3', + 'numba>=0.59.0', + 'numexpr>=2.9.0', 'odfpy>=1.4.1', - 'openpyxl>=3.1.0', + 'openpyxl>=3.1.2', 'psycopg2>=2.9.6', 'pyarrow>=10.0.1', - 'pymysql>=1.0.2', + 'pyiceberg>=0.7.1', + 'pymysql>=1.1.0', 'PyQt5>=5.15.9', - 'pyreadstat>=1.2.0', + 'pyreadstat>=1.2.6', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0', 'python-calamine>=0.1.7', 'pytz>=2023.4', 'pyxlsb>=1.0.10', 'qtpy>=2.3.0', - 'scipy>=1.10.0', - 's3fs>=2022.11.0', + 'scipy>=1.12.0', + 's3fs>=2023.12.2', 'SQLAlchemy>=2.0.0', 'tables>=3.8.0', 'tabulate>=0.9.0', - 'xarray>=2022.12.0', + 'xarray>=2024.1.1', 'xlrd>=2.0.1', - 'xlsxwriter>=3.0.5', - 'zstandard>=0.19.0'] + 'xlsxwriter>=3.2.0', + 'zstandard>=0.22.0'] # TODO: Remove after setuptools support is dropped. [tool.setuptools] @@ -148,7 +146,7 @@ setup = ['--vsenv'] # For Windows [tool.cibuildwheel] skip = "cp36-* cp37-* cp38-* cp39-* pp* *_i686 *_ppc64le *_s390x" -build-verbosity = "3" +build-verbosity = 3 environment = {LDFLAGS="-Wl,--strip-all"} test-requires = "hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0" test-command = """ @@ -156,12 +154,12 @@ test-command = """ pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db", "-n 2", "--no-strict-data-files"]); \ pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ """ -free-threaded-support = true +enable = ["cpython-freethreading"] before-build = "PACKAGE_DIR={package} bash {package}/scripts/cibw_before_build.sh" [tool.cibuildwheel.windows] -before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build_windows.sh" -before-test = "bash {package}/scripts/cibw_before_test_windows.sh" +environment = {} +before-build = "pip install delvewheel" test-command = """ set PANDAS_CI='1' && \ python -c "import pandas as pd; \ @@ -234,8 +232,8 @@ select = [ "TID", # implicit string concatenation "ISC", - # type-checking imports - "TCH", + # flake8-type-checking + "TC", # comprehensions "C4", # pygrep-hooks @@ -390,6 +388,8 @@ ignore = [ "PLW0108", # global-statement "PLW0603", + # runtime-cast-value + "TC006", ] exclude = [ @@ -429,7 +429,7 @@ exclude = [ "pandas/tests/*" = ["B028", "FLY"] "scripts/*" = ["B028"] # Keep this one enabled -"pandas/_typing.py" = ["TCH"] +"pandas/_typing.py" = ["TC"] [tool.ruff.lint.flake8-pytest-style] fixture-parentheses = false diff --git a/requirements-dev.txt b/requirements-dev.txt index 990901958cd9e..6515797bc3b9d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,7 +3,7 @@ pip versioneer[toml] -cython~=3.0.5 +cython<4.0.0a0 meson[ninja]==1.2.1 meson-python==0.13.1 pytest>=7.3.2 @@ -14,41 +14,40 @@ pytest-localserver PyQt5>=5.15.9 coverage python-dateutil -numpy<2 -beautifulsoup4>=4.11.2 -blosc +numpy<3 +beautifulsoup4>=4.12.3 bottleneck>=1.3.6 -fastparquet>=2023.10.0 -fsspec>=2022.11.0 +fastparquet>=2024.2.0 +fsspec>=2023.12.2 html5lib>=1.1 hypothesis>=6.84.0 -gcsfs>=2022.11.0 +gcsfs>=2023.12.2 ipython pickleshare -jinja2>=3.1.2 +jinja2>=3.1.3 lxml>=4.9.2 -matplotlib>=3.6.3 -numba>=0.56.4 -numexpr>=2.8.4 -openpyxl>=3.1.0 
+matplotlib>=3.8.3 +numba>=0.59.0 +numexpr>=2.9.0 +openpyxl>=3.1.2 odfpy>=1.4.1 -py psycopg2-binary>=2.9.6 pyarrow>=10.0.1 -pymysql>=1.0.2 -pyreadstat>=1.2.0 +pyiceberg>=0.7.1 +pymysql>=1.1.0 +pyreadstat>=1.2.6 tables>=3.8.0 python-calamine>=0.1.7 pytz>=2023.4 pyxlsb>=1.0.10 -s3fs>=2022.11.0 -scipy>=1.10.0 +s3fs>=2023.12.2 +scipy>=1.12.0 SQLAlchemy>=2.0.0 tabulate>=0.9.0 -xarray>=2022.12.0, <=2024.9.0 +xarray>=2024.1.1 xlrd>=2.0.1 -xlsxwriter>=3.0.5 -zstandard>=0.19.0 +xlsxwriter>=3.2.0 +zstandard>=0.22.0 dask seaborn moto @@ -57,10 +56,8 @@ asv>=0.6.1 flake8==7.1.0 mypy==1.13.0 tokenize-rt -pre-commit>=4.0.1 +pre-commit>=4.2.0 gitpython -gitdb -google-auth natsort numpydoc pydata-sphinx-theme==0.16 diff --git a/scripts/cibw_before_build.sh b/scripts/cibw_before_build.sh index d326dd3637314..274848972bd7e 100644 --- a/scripts/cibw_before_build.sh +++ b/scripts/cibw_before_build.sh @@ -3,11 +3,3 @@ for file in $PACKAGE_DIR/LICENSES/*; do cat $file >> $PACKAGE_DIR/LICENSE done - -# TODO: Delete when there's a PyPI Cython release that supports free-threaded Python 3.13. -FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" -if [[ $FREE_THREADED_BUILD == "True" ]]; then - python -m pip install -U pip - python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython - python -m pip install numpy ninja meson-python versioneer[toml] -fi diff --git a/scripts/cibw_before_build_windows.sh b/scripts/cibw_before_build_windows.sh deleted file mode 100644 index f9e1e68d8efba..0000000000000 --- a/scripts/cibw_before_build_windows.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -# Add 3rd party licenses, like numpy does -for file in $PACKAGE_DIR/LICENSES/*; do - cat $file >> $PACKAGE_DIR/LICENSE -done - -# TODO: Delete when there's a PyPI Cython release that supports free-threaded Python 3.13 -# and a NumPy Windows wheel for the free-threaded build on PyPI. -FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" -if [[ $FREE_THREADED_BUILD == "True" ]]; then - python -m pip install -U pip - python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython - python -m pip install ninja meson-python versioneer[toml] -fi diff --git a/scripts/cibw_before_test_windows.sh b/scripts/cibw_before_test_windows.sh deleted file mode 100644 index 8878e3950452f..0000000000000 --- a/scripts/cibw_before_test_windows.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -# TODO: Delete when there's a NumPy Windows wheel for the free-threaded build on PyPI. 
-FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" -if [[ $FREE_THREADED_BUILD == "True" ]]; then - python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy -fi diff --git a/scripts/run_stubtest.py b/scripts/run_stubtest.py index df88c61061f12..e87a7d53f4ff3 100644 --- a/scripts/run_stubtest.py +++ b/scripts/run_stubtest.py @@ -85,9 +85,11 @@ ] # create allowlist - with tempfile.NamedTemporaryFile(mode="w+t") as allow: - allow.write("\n".join(_ALLOWLIST)) - allow.flush() + with tempfile.TemporaryDirectory() as td: + allow = os.path.join(td, "test") + with open(allow, "w+t") as allow: + allow.write("\n".join(_ALLOWLIST)) + allow.flush() args = pyi_modules + [ "--ignore-missing-stub", diff --git a/scripts/tests/data/deps_expected_random.yaml b/scripts/tests/data/deps_expected_random.yaml index d1db7989a95a4..d4ecd9f64a68d 100644 --- a/scripts/tests/data/deps_expected_random.yaml +++ b/scripts/tests/data/deps_expected_random.yaml @@ -23,7 +23,6 @@ dependencies: # optional dependencies - beautifulsoup4>=5.9.3 - - blosc - bottleneck>=1.3.2 - fastparquet>=0.6.3 - fsspec>=2021.07.0 @@ -39,7 +38,7 @@ dependencies: - odfpy>=1.4.1 - psycopg2>=2.8.6 - pyarrow<11, >=7.0.0 - - pymysql>=1.0.2 + - pymysql>=1.1.0 - pyreadstat>=1.1.2 - pytables>=3.6.1 - python-calamine>=0.1.7 diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index 0a53225a5d995..21c269f573b3d 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -63,12 +63,10 @@ gcp = ['gcsfs>=2021.07.0'] excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] parquet = ['pyarrow>=7.0.0'] feather = ['pyarrow>=7.0.0'] -hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) - #'blosc>=1.20.1', - 'tables>=3.6.1'] +hdf5 = ['tables>=3.6.1'] spss = ['pyreadstat>=1.1.2'] postgresql = ['SQLAlchemy>=1.4.16', 'psycopg2>=2.8.6'] -mysql = ['SQLAlchemy>=1.4.16', 'pymysql>=1.0.2'] +mysql = ['SQLAlchemy>=1.4.16', 'pymysql>=1.1.0'] sql-other = ['SQLAlchemy>=1.4.16'] html = ['beautifulsoup4>=4.9.3', 'html5lib>=1.1', 'lxml>=4.6.3'] xml = ['lxml>=4.6.3'] @@ -77,8 +75,6 @@ output_formatting = ['jinja2>=3.0.0', 'tabulate>=0.8.9'] clipboard = ['PyQt5>=5.15.1', 'qtpy>=2.3.0'] compression = ['zstandard>=0.15.2'] all = ['beautifulsoup4>=5.9.3', - # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) - #'blosc>=1.21.0', 'bottleneck>=1.3.2', 'fastparquet>=0.6.3', 'fsspec>=2021.07.0', @@ -94,7 +90,7 @@ all = ['beautifulsoup4>=5.9.3', 'openpyxl>=3.0.7', 'psycopg2>=2.8.6', 'pyarrow>=7.0.0', - 'pymysql>=1.0.2', + 'pymysql>=1.1.0', 'PyQt5>=5.15.1', 'pyreadstat>=1.1.2', 'pytest>=7.3.2', diff --git a/scripts/tests/data/deps_unmodified_random.yaml b/scripts/tests/data/deps_unmodified_random.yaml index afb28dd2c08bb..4b0f4ffb51b92 100644 --- a/scripts/tests/data/deps_unmodified_random.yaml +++ b/scripts/tests/data/deps_unmodified_random.yaml @@ -23,7 +23,6 @@ dependencies: # optional dependencies - beautifulsoup4 - - blosc - bottleneck>=1.3.2 - fastparquet>=0.6.3 - fsspec>=2021.07.0 @@ -39,7 +38,7 @@ dependencies: - odfpy>=1.4.1 - psycopg2 - pyarrow<11, >=7.0.0 - - pymysql>=1.0.2 + - pymysql>=1.1.0 - pyreadstat>=1.1.2 - pytables>=3.6.1 - python-calamine>=0.1.7 diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py index 
1001b00450354..7908aaef3d890 100755 --- a/scripts/validate_min_versions_in_sync.py +++ b/scripts/validate_min_versions_in_sync.py @@ -36,7 +36,7 @@ SETUP_PATH = pathlib.Path("pyproject.toml").resolve() YAML_PATH = pathlib.Path("ci/deps") ENV_PATH = pathlib.Path("environment.yml") -EXCLUDE_DEPS = {"tzdata", "blosc", "pyqt", "pyqt5"} +EXCLUDE_DEPS = {"tzdata", "pyqt", "pyqt5"} EXCLUSION_LIST = frozenset(["python=3.8[build=*_pypy]"]) # pandas package is not available # in pre-commit environment diff --git a/setup.py b/setup.py index 737ebd270d1e4..db1852b43cfa9 100755 --- a/setup.py +++ b/setup.py @@ -364,7 +364,7 @@ def run(self) -> None: # enable coverage by building cython files by setting the environment variable # "PANDAS_CYTHON_COVERAGE" (with a Truthy value) or by running build_ext # with `--with-cython-coverage`enabled -linetrace = os.environ.get("PANDAS_CYTHON_COVERAGE", False) +linetrace = os.environ.get("PANDAS_CYTHON_COVERAGE", False) # noqa: PLW1508 if "--with-cython-coverage" in sys.argv: linetrace = True sys.argv.remove("--with-cython-coverage") diff --git a/web/pandas/about/roadmap.md b/web/pandas/about/roadmap.md index aba95ec2c03fc..278143c01e7dc 100644 --- a/web/pandas/about/roadmap.md +++ b/web/pandas/about/roadmap.md @@ -58,27 +58,6 @@ library, making their behavior more consistent with the handling of NumPy arrays. We'll do this by cleaning up pandas' internals and adding new methods to the extension array interface. -### String data type - -Currently, pandas stores text data in an `object` -dtype NumPy array. -The current implementation has two primary drawbacks: First, `object` --dtype is not specific to strings: any Python object can be stored in an -`object` -dtype array, not just strings. Second: this is not efficient. -The NumPy memory model isn't especially well-suited to variable width -text data. - -To solve the first issue, we propose a new extension type for string -data. This will initially be opt-in, with users explicitly requesting -`dtype="string"`. The array backing this string dtype may initially be -the current implementation: an `object` -dtype NumPy array of Python -strings. - -To solve the second issue (performance), we'll explore alternative -in-memory array libraries (for example, Apache Arrow). As part of the -work, we may need to implement certain operations expected by pandas -users (for example the algorithm used in, `Series.str.upper`). That work -may be done outside of pandas. - ### Apache Arrow interoperability [Apache Arrow](https://arrow.apache.org) is a cross-language development diff --git a/web/pandas/community/benchmarks.md b/web/pandas/community/benchmarks.md index 1e63832a5a2ba..0844e81b0166b 100644 --- a/web/pandas/community/benchmarks.md +++ b/web/pandas/community/benchmarks.md @@ -11,7 +11,7 @@ kinds of benchmarks relevant to pandas: pandas benchmarks are implemented in the [asv_bench](https://github.com/pandas-dev/pandas/tree/main/asv_bench) directory of our repository. The benchmarks are implemented for the -[airspeed velocity](https://asv.readthedocs.io/en/v0.6.1/) (asv for short) framework. +[airspeed velocity](https://asv.readthedocs.io/en/latest/) (asv for short) framework. The benchmarks can be run locally by any pandas developer. 
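For orientation, an asv benchmark is an ordinary Python class: asv calls `setup` and then times each `time_*` method. The sketch below only illustrates that shape; the class and column names are invented for this example and are not taken from `asv_bench`.

```python
import numpy as np
import pandas as pd


class GroupbySumSketch:
    # asv instantiates the class and runs setup() before timing each time_* method.
    def setup(self):
        rng = np.random.default_rng(42)
        self.df = pd.DataFrame(
            {
                "key": rng.integers(0, 100, size=100_000),
                "value": rng.random(100_000),
            }
        )

    def time_groupby_sum(self):
        # Only the body of a time_* method is measured.
        self.df.groupby("key")["value"].sum()
```

Running a suite of such classes locally is then a single `asv` invocation, as described next.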
This can be done with the `asv run` command, and it can be useful to detect if local changes have @@ -22,54 +22,15 @@ More information on running the performance test suite is found Note that benchmarks are not deterministic, and running in different hardware or running in the same hardware with different levels of stress have a big impact in the result. Even running the benchmarks with identical hardware and almost identical -conditions produces significant differences when running the same exact code. +conditions can produce significant differences when running the same exact code. -## pandas benchmarks servers +## Automated benchmark runner -We currently have two physical servers running the benchmarks of pandas for every -(or almost every) commit to the `main` branch. The servers run independently from -each other. The original server has been running for a long time, and it is physically -located with one of the pandas maintainers. The newer server is in a datacenter -kindly sponsored by [OVHCloud](https://www.ovhcloud.com/). More information about -pandas sponsors, and how your company can support the development of pandas is -available at the [pandas sponsors]({{ base_url }}about/sponsors.html) page. +The [asv-runner](https://github.com/pandas-dev/asv-runner/) repository automatically runs the pandas asv benchmark suite +for every (or almost every) commit to the `main` branch. It is run on GitHub actions. +See the linked repository for more details. The results are available at: -Results of the benchmarks are available at: - -- Original server: [asv](https://asv-runner.github.io/asv-collection/pandas/) -- OVH server: [asv](https://pandas.pydata.org/benchmarks/asv/) (benchmarks results can - also be visualized in this [Conbench PoC](http://57.128.112.95:5000/) - -### Original server configuration - -The machine can be configured with the Ansible playbook in -[tomaugspurger/asv-runner](https://github.com/tomaugspurger/asv-runner). -The results are published to another GitHub repository, -[tomaugspurger/asv-collection](https://github.com/tomaugspurger/asv-collection). - -The benchmarks are scheduled by [Airflow](https://airflow.apache.org/). -It has a dashboard for viewing and debugging the results. -You’ll need to setup an SSH tunnel to view them: - -``` -ssh -L 8080:localhost:8080 pandas@panda.likescandy.com -``` - -### OVH server configuration - -The server used to run the benchmarks has been configured to reduce system -noise and maximize the stability of the benchmarks times. - -The details on how the server is configured can be found in the -[pandas-benchmarks repository](https://github.com/pandas-dev/pandas-benchmarks). 
-There is a quick summary here: - -- CPU isolation: Avoid user space tasks to execute in the same CPU as benchmarks, possibly interrupting them during the execution (include all virtual CPUs using a physical core) -- NoHZ: Stop the kernel tick that enables context switching in the isolated CPU -- IRQ affinity: Ban benchmarks CPU to avoid many (but not all) kernel interruption in the isolated CPU -- TurboBoost: Disable CPU scaling based on high CPU demand -- P-States: Use "performance" governor to disable P-States and CPU frequency changes based on them -- C-States: Set C-State to 0 and disable changes to avoid slower CPU after system inactivity +[https://pandas-dev.github.io/asv-runner/](https://pandas-dev.github.io/asv-runner/) ## Community benchmarks diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 74b7c1f4884a1..78c239ac4f690 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -1,66 +1,99 @@ # Ecosystem -Increasingly, packages are being built on top of pandas to address -specific needs in data preparation, analysis and visualization. This is -encouraging because it means pandas is not only helping users to handle -their data tasks but also that it provides a better starting point for -developers to build powerful and more focused data tools. The creation -of libraries that complement pandas' functionality also allows pandas -development to remain focused around its original requirements. +[TOC] -This is a community-maintained list of projects that build on pandas in order -to provide tools in the PyData space. The pandas core development team does not necessarily endorse any particular project on this list or have any knowledge of the maintenance status of any particular library. +This is a community-maintained list of projects that build on pandas in order to provide tools +in the PyData space. The pandas core development team does not necessarily endorse any particular +project on this list or have any knowledge of the maintenance status of any particular library. -For a more complete list of projects that depend on pandas, see the [libraries.io usage page for -pandas](https://libraries.io/pypi/pandas/usage) or [search pypi for -pandas](https://pypi.org/search/?q=pandas). +## Extensions -We'd like to make it easier for users to find these projects, if you -know of other substantial projects that you feel should be on this list, -please let us know. +pandas has different ways to allow third-party packages to enhance its +functionality. This section contains a list of known projects that +extend pandas functionality. -## Statistics and machine learning +Developers who want to extend pandas can find more information in the +[Extending pandas](https://pandas.pydata.org/docs/dev/development/extending.html) +page in our documentation. -### [Statsmodels](https://www.statsmodels.org/) +### Accessors -Statsmodels is the prominent Python "statistics and econometrics -library" and it has a long-standing special relationship with pandas. -Statsmodels provides powerful statistics, econometrics, analysis and -modeling functionality that is out of pandas' scope. Statsmodels -leverages pandas objects as the underlying data container for -computation. +A directory of projects providing +[extension accessors](https://pandas.pydata.org/docs/development/extending.html#registering-custom-accessors). +This is for users to discover new accessors and for library +authors to coordinate on the namespace. 
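To make the registration mechanism concrete before the list of known accessors, here is a minimal sketch using the public `pandas.api.extensions` decorators; the `units` namespace and its `scale` method are invented for illustration and do not correspond to any listed project.

```python
import pandas as pd


@pd.api.extensions.register_dataframe_accessor("units")
class UnitsAccessor:
    def __init__(self, pandas_obj):
        # pandas passes the DataFrame being accessed to the constructor.
        self._obj = pandas_obj

    def scale(self, factor):
        # Return a copy of the numeric columns multiplied by `factor`.
        return self._obj.select_dtypes("number") * factor


df = pd.DataFrame({"length_cm": [10.0, 25.0]})
print(df.units.scale(10))  # hypothetical accessor namespace
```

A library would normally perform this registration as an import side effect, so that importing the package makes its namespace available on every DataFrame or Series.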
-### [skrub](https://skrub-data.org)
+  | Library                                                              | Accessor   | Classes               |
+  | -------------------------------------------------------------------- | ---------- | --------------------- |
+  | [awkward-pandas](https://awkward-pandas.readthedocs.io/en/latest/)    | `ak`       | `Series`              |
+  | [pdvega](https://altair-viz.github.io/pdvega/)                        | `vgplot`   | `Series`, `DataFrame` |
+  | [pandas-genomics](https://pandas-genomics.readthedocs.io/en/latest/)  | `genomics` | `Series`, `DataFrame` |
+  | [pint-pandas](https://github.com/hgrecco/pint-pandas)                 | `pint`     | `Series`, `DataFrame` |
+  | [physipandas](https://github.com/mocquin/physipandas)                 | `physipy`  | `Series`, `DataFrame` |
+  | [composeml](https://github.com/alteryx/compose)                       | `slice`    | `DataFrame`           |
+  | [gurobipy-pandas](https://github.com/Gurobi/gurobipy-pandas)          | `gppd`     | `Series`, `DataFrame` |
+  | [staircase](https://www.staircase.dev/)                               | `sc`       | `Series`, `DataFrame` |
+  | [woodwork](https://github.com/alteryx/woodwork)                       | `ww`       | `Series`, `DataFrame` |
-Skrub facilitates machine learning on dataframes. It bridges pandas
-to scikit-learn and related. In particular it facilitates building
-features from dataframes.
+### Data types
-### [Featuretools](https://github.com/alteryx/featuretools/)
+Pandas provides an interface for defining
+[extension types](https://pandas.pydata.org/docs/development/extending.html#extension-types) to extend NumPy's type system.
+The following libraries implement that interface to provide types not found in NumPy or pandas,
+which work well with pandas' data containers.
-Featuretools is a Python library for automated feature engineering built
-on top of pandas. It excels at transforming temporal and relational
-datasets into feature matrices for machine learning using reusable
-feature engineering "primitives". Users can contribute their own
-primitives in Python and share them with the rest of the community.
+#### [awkward-pandas](https://github.com/scikit-hep/awkward)
-### [Compose](https://github.com/alteryx/compose)
+Awkward-pandas provides an extension type for storing [Awkward
+Arrays](https://awkward-array.org/) inside pandas' Series and
+DataFrame. It also provides an accessor for using awkward functions
+on Series that are of awkward type.
-Compose is a machine learning tool for labeling data and prediction engineering.
-It allows you to structure the labeling process by parameterizing
-prediction problems and transforming time-driven relational data into
-target values with cutoff times that can be used for supervised learning.
+#### [db-dtypes](https://github.com/googleapis/python-db-dtypes-pandas)
-### [STUMPY](https://github.com/TDAmeritrade/stumpy)
+db-dtypes provides extension types for working with types like
+DATE, TIME, and JSON from database systems. This package is used
+by pandas-gbq to provide natural dtypes for BigQuery data types without
+a natural numpy type.
-STUMPY is a powerful and scalable Python library for modern time series analysis.
-At its core, STUMPY efficiently computes something called a
-[matrix profile](https://stumpy.readthedocs.io/en/latest/Tutorial_The_Matrix_Profile.html),
-which can be used for a wide variety of time series data mining tasks.
+#### [Pandas-Genomics](https://pandas-genomics.readthedocs.io/en/latest/)
+
+Pandas-Genomics provides an extension type and extension array for working
+ with genomics data. It also includes `genomics` accessors for many useful properties
+ and methods related to QC and analysis of genomics data.
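As a quick illustration of the machinery these packages plug into, the snippet below requests an extension dtype by its registered string name. It uses pandas' own nullable `Int64` dtype so that it runs with pandas alone; dtypes registered by the third-party libraries in this section are requested the same way once the library is imported.

```python
import pandas as pd

# "Int64" is the string name of a registered extension dtype that ships with
# pandas itself; third-party packages register their own dtype strings in the
# same registry when they are imported.
s = pd.Series([1, 2, None], dtype="Int64")
print(s.dtype)             # Int64
print(s.isna().tolist())   # [False, False, True]
```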
+
+#### [Physipandas](https://github.com/mocquin/physipandas)
+
+Physipandas provides an extension for manipulating physical quantities
+ (like scalar and numpy.ndarray) in association with a physical unit
+ (like meter or joule) and additional features for integration of
+ `physipy` accessors with pandas Series and DataFrame.
+
+#### [Pint-Pandas](https://github.com/hgrecco/pint-pandas)
+
+Pint-Pandas provides an extension type for storing numeric arrays with units.
+These arrays can be stored inside pandas' Series and DataFrame. Operations
+between Series and DataFrame columns which use pint's extension array are then
+units aware.
+
+#### [Text Extensions](https://ibm.biz/text-extensions-for-pandas)
+
+Text Extensions for Pandas provides extension types to cover common data
+structures for representing natural language data, plus library integrations
+that convert the outputs of popular natural language processing libraries into pandas DataFrames.
+
+### Plotting backends
+
+pandas uses [Matplotlib](https://matplotlib.org/) by default for plotting. This can be
+changed with the `plotting.backend` option:
+
+```python
+pd.set_option("plotting.backend", "")
+```
-## Visualization
+This is the list of known plotting backends:
-### [Altair](https://altair-viz.github.io/)
+#### [Altair](https://altair-viz.github.io/)
 Altair is a declarative statistical visualization library for Python.
 With Altair, you can spend more time understanding your data and its
@@ -69,7 +102,14 @@ top of the powerful Vega-Lite JSON specification. This elegant
 simplicity produces beautiful and effective visualizations with a
 minimal amount of code. Altair works with Pandas DataFrames.
-### [Bokeh](https://docs.bokeh.org)
+[altair-pandas](https://github.com/altair-viz/altair_pandas) provides
+the pandas Altair backend via:
+
+```python
+pd.set_option("plotting.backend", "altair")
+```
+
+#### [Bokeh](https://docs.bokeh.org)
 Bokeh is a Python interactive visualization library for large datasets
 that natively uses the latest web technologies. Its goal is to provide
@@ -79,62 +119,25 @@ data to thin clients.
 [Pandas-Bokeh](https://github.com/PatrikHlobil/Pandas-Bokeh) provides
 a high level API for Bokeh that can be loaded as a native Pandas plotting
-backend via
+backend via:
-```
+```python
 pd.set_option("plotting.backend", "pandas_bokeh")
 ```
 It is very similar to the matplotlib plotting backend, but provides
 interactive web-based charts and maps.
-### [pygwalker](https://github.com/Kanaries/pygwalker)
+#### [hvplot](https://hvplot.holoviz.org/index.html)
-PyGWalker is an interactive data visualization and
-exploratory data analysis tool built upon Graphic Walker
-with support for visualization, cleaning, and annotation workflows.
-
-pygwalker can save interactively created charts
-to Graphic-Walker and Vega-Lite JSON.
-
-```
-import pygwalker as pyg
-pyg.walk(df)
-```
-
-### [seaborn](https://seaborn.pydata.org)
-
-Seaborn is a Python visualization library based on
-[matplotlib](https://matplotlib.org). It provides a high-level,
-dataset-oriented interface for creating attractive statistical graphics.
-The plotting functions in seaborn understand pandas objects and leverage
-pandas grouping operations internally to support concise specification
-of complex visualizations. Seaborn also goes beyond matplotlib and
-pandas with the option to perform statistical estimation while plotting,
-aggregating across observations and visualizing the fit of statistical
-models to emphasize patterns in a dataset.
+hvPlot is a high-level plotting API for the PyData ecosystem built on [HoloViews](https://holoviews.org/). +It can be loaded as a native pandas plotting backend via: +```python +pd.set_option("plotting.backend", "hvplot") ``` -import seaborn as sns -sns.set_theme() -``` - -### [plotnine](https://github.com/has2k1/plotnine/) - -Hadley Wickham's [ggplot2](https://ggplot2.tidyverse.org/) is a -foundational exploratory visualization package for the R language. Based -on ["The Grammar of -Graphics"](https://www.cs.uic.edu/~wilkinson/TheGrammarOfGraphics/GOG.html) -it provides a powerful, declarative and extremely general way to -generate bespoke plots of any kind of data. -Various implementations to other languages are available. -A good implementation for Python users is [has2k1/plotnine](https://github.com/has2k1/plotnine/). - -### [IPython Vega](https://github.com/vega/ipyvega) -[IPython Vega](https://github.com/vega/ipyvega) leverages [Vega](https://github.com/vega/vega) to create plots within Jupyter Notebook. - -### [Plotly](https://plot.ly/python) +#### [Plotly](https://plot.ly/python) [Plotly's](https://plot.ly/) [Python API](https://plot.ly/python/) enables interactive figures and web shareability. Maps, 2D, 3D, and @@ -146,191 +149,54 @@ Seaborn](https://plot.ly/python/matplotlib-to-plotly-tutorial/) can convert figures into interactive web-based plots. Plots can be drawn in [IPython Notebooks](https://plot.ly/ipython-notebooks/) , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly -is free for unlimited sharing, and has -[cloud](https://plot.ly/product/plans/), -[offline](https://plot.ly/python/offline/), or -[on-premise](https://plot.ly/product/enterprise/) accounts for private -use. - -### [Lux](https://github.com/lux-org/lux) - -Lux is a Python library that facilitates fast and easy experimentation with data by automating the visual data exploration process. To use Lux, simply add an extra import alongside pandas: - -```python -import lux -import pandas as pd - -df = pd.read_csv("data.csv") -df # discover interesting insights! -``` - -By printing out a dataframe, Lux automatically [recommends a set of visualizations](https://raw.githubusercontent.com/lux-org/lux-resources/master/readme_img/demohighlight.gif) that highlights interesting trends and patterns in the dataframe. Users can leverage any existing pandas commands without modifying their code, while being able to visualize their pandas data structures (e.g., DataFrame, Series, Index) at the same time. Lux also offers a [powerful, intuitive language](https://lux-api.readthedocs.io/en/latest/source/guide/vis.html) that allow users to create Altair, matplotlib, or Vega-Lite visualizations without having to think at the level of code. - -### [D-Tale](https://github.com/man-group/dtale) - -D-Tale is a lightweight web client for visualizing pandas data structures. It -provides a rich spreadsheet-style grid which acts as a wrapper for a lot of -pandas functionality (query, sort, describe, corr...) so users can quickly -manipulate their data. There is also an interactive chart-builder using Plotly -Dash allowing users to build nice portable visualizations. D-Tale can be -invoked with the following command - -```python -import dtale - -dtale.show(df) -``` - -D-Tale integrates seamlessly with Jupyter notebooks, Python terminals, Kaggle -& Google Colab. Here are some demos of the [grid](http://alphatechadmin.pythonanywhere.com/dtale/main/1). 
+is free for unlimited sharing, and has cloud, offline, or on-premise +accounts for private use. -### [hvplot](https://hvplot.holoviz.org/index.html) - -hvPlot is a high-level plotting API for the PyData ecosystem built on [HoloViews](https://holoviews.org/). -It can be loaded as a native pandas plotting backend via +Plotly can be used as a pandas plotting backend via: ```python -pd.set_option("plotting.backend", "hvplot") +pd.set_option("plotting.backend", "plotly") ``` -## IDE - -### [IPython](https://ipython.org/documentation.html) - -IPython is an interactive command shell and distributed computing -environment. IPython tab completion works with Pandas methods and also -attributes like DataFrame columns. +## Domain specific pandas extensions -### [Jupyter Notebook / Jupyter Lab](https://jupyter.org) - -Jupyter Notebook is a web application for creating Jupyter notebooks. A -Jupyter notebook is a JSON document containing an ordered list of -input/output cells which can contain code, text, mathematics, plots and -rich media. Jupyter notebooks can be converted to a number of open -standard output formats (HTML, HTML presentation slides, LaTeX, PDF, -ReStructuredText, Markdown, Python) through 'Download As' in the web -interface and `jupyter convert` in a shell. - -Pandas DataFrames implement `_repr_html_` and `_repr_latex` methods which -are utilized by Jupyter Notebook for displaying (abbreviated) HTML or -LaTeX tables. LaTeX output is properly escaped. (Note: HTML tables may -or may not be compatible with non-HTML Jupyter output formats.) - -See [Options and Settings](https://pandas.pydata.org/docs/user_guide/options.html) -for pandas `display.` settings. - -### [Spyder](https://www.spyder-ide.org/) - -Spyder is a cross-platform PyQt-based IDE combining the editing, -analysis, debugging and profiling functionality of a software -development tool with the data exploration, interactive execution, deep -inspection and rich visualization capabilities of a scientific -environment like MATLAB or Rstudio. - -Its [Variable -Explorer](https://docs.spyder-ide.org/current/panes/variableexplorer.html) allows -users to view, manipulate and edit pandas `Index`, `Series`, and -`DataFrame` objects like a "spreadsheet", including copying and -modifying values, sorting, displaying a "heatmap", converting data -types and more. Pandas objects can also be renamed, duplicated, new -columns added, copied/pasted to/from the clipboard (as TSV), and -saved/loaded to/from a file. Spyder can also import data from a variety -of plain text and binary files or the clipboard into a new pandas -DataFrame via a sophisticated import wizard. - -Most pandas classes, methods and data attributes can be autocompleted in -Spyder's [Editor](https://docs.spyder-ide.org/current/panes/editor.html) and [IPython -Console](https://docs.spyder-ide.org/current/panes/ipythonconsole.html), and Spyder's -[Help pane](https://docs.spyder-ide.org/current/panes/help.html) can retrieve and -render Numpydoc documentation on pandas objects in rich text with Sphinx -both automatically and on-demand. - -### [marimo](https://marimo.io) - -marimo is a reactive notebook for Python and SQL that enhances productivity when working with dataframes. It provides several features to make data manipulation and visualization more interactive and fun: - -1. Rich, interactive displays: marimo can display pandas dataframes in interactive tables or charts with filtering and sorting capabilities. -2. 
Data selection: Users can select data in tables or pandas-backed plots, and the selections are automatically sent to Python as pandas dataframes. -3. No-code transformations: Users can interactively transform pandas dataframes using a GUI, without writing code. The generated code can be copied and pasted into the notebook. -4. Custom filters: marimo allows the creation of pandas-backed filters using UI elements like sliders and dropdowns. -5. Dataset explorer: marimo automatically discovers and displays all dataframes in the notebook, allowing users to explore and visualize data interactively. -6. SQL integration: marimo allows users to write SQL queries against any pandas dataframes existing in memory. - -## API - -### [pandas-datareader](https://github.com/pydata/pandas-datareader) - -`pandas-datareader` is a remote data access library for pandas -(PyPI:`pandas-datareader`). It is based on functionality that was -located in `pandas.io.data` and `pandas.io.wb` but was split off in -v0.19. See more in the [pandas-datareader -docs](https://pandas-datareader.readthedocs.io/en/latest/): - -The following data feeds are available: - -- Google Finance -- Tiingo -- Morningstar -- IEX -- Robinhood -- Enigma -- Quandl -- FRED -- Fama/French -- World Bank -- OECD -- Eurostat -- TSP Fund Data -- Nasdaq Trader Symbol Definitions -- Stooq Index Data -- MOEX Data - -### [pandaSDMX](https://pandasdmx.readthedocs.io) - -pandaSDMX is a library to retrieve and acquire statistical data and -metadata disseminated in [SDMX](https://sdmx.org) 2.1, an -ISO-standard widely used by institutions such as statistics offices, -central banks, and international organisations. pandaSDMX can expose -datasets and related structural metadata including data flows, -code-lists, and data structure definitions as pandas Series or -MultiIndexed DataFrames. - -### [fredapi](https://github.com/mortada/fredapi) - -fredapi is a Python interface to the [Federal Reserve Economic Data -(FRED)](https://fred.stlouisfed.org/) provided by the Federal Reserve -Bank of St. Louis. It works with both the FRED database and ALFRED -database that contains point-in-time data (i.e. historic data -revisions). fredapi provides a wrapper in Python to the FRED HTTP API, -and also provides several convenient methods for parsing and analyzing -point-in-time data from ALFRED. fredapi makes use of pandas and returns -data in a Series or DataFrame. This module requires a FRED API key that -you can obtain for free on the FRED website. - -## Domain specific - -### [Geopandas](https://github.com/geopandas/geopandas) +#### [Geopandas](https://github.com/geopandas/geopandas) Geopandas extends pandas data objects to include geographic information which support geometric operations. If your work entails maps and geographical coordinates, and you love pandas, you should take a close look at Geopandas. -### [gurobipy-pandas](https://github.com/Gurobi/gurobipy-pandas) +#### [gurobipy-pandas](https://github.com/Gurobi/gurobipy-pandas) gurobipy-pandas provides a convenient accessor API to connect pandas with gurobipy. It enables users to more easily and efficiently build mathematical optimization models from data stored in DataFrames and Series, and to read solutions back directly as pandas objects. -### [staircase](https://github.com/staircase-dev/staircase) +#### [Hail Query](https://hail.is/) + +An out-of-core, preemptible-safe, distributed, dataframe library serving +the genetics community. 
Hail Query ships with on-disk data formats, +in-memory data formats, an expression compiler, a query planner, and a +distributed sort algorithm all designed to accelerate queries on large +matrices of genome sequencing data. + +It is often easiest to use pandas to manipulate the summary statistics or +other small aggregates produced by Hail. For this reason, Hail provides +native import to and export from pandas DataFrames: + +- [`Table.from_pandas`](https://hail.is/docs/latest/hail.Table.html#hail.Table.from_pandas) +- [`Table.to_pandas`](https://hail.is/docs/latest/hail.Table.html#hail.Table.to_pandas) + +#### [staircase](https://github.com/staircase-dev/staircase) staircase is a data analysis package, built upon pandas and numpy, for modelling and manipulation of mathematical step functions. It provides a rich variety of arithmetic operations, relational operations, logical operations, statistical operations and aggregations for step functions defined over real numbers, datetime and timedelta domains. -### [xarray](https://github.com/pydata/xarray) +#### [xarray](https://github.com/pydata/xarray) xarray brings the labeled data power of pandas to the physical sciences by providing N-dimensional variants of the core pandas data structures. @@ -338,172 +204,151 @@ It aims to provide a pandas-like and pandas-compatible toolkit for analytics on multi-dimensional arrays, rather than the tabular data for which pandas excels. -## IO - -### [NTV-pandas](https://github.com/loco-philippe/ntv-pandas) - -NTV-pandas provides a JSON converter with more data types than the ones supported by pandas directly. - -It supports the following data types: - -- pandas data types -- data types defined in the [NTV format](https://loco-philippe.github.io/ES/JSON%20semantic%20format%20(JSON-NTV).htm) -- data types defined in [Table Schema specification](https://datapackage.org/standard/table-schema/) - -The interface is always reversible (conversion round trip) with two formats (JSON-NTV and JSON-TableSchema). -Example: +## Data IO for pandas -```python -import ntv_pandas as npd +#### [ArcticDB](https://github.com/man-group/ArcticDB) -jsn = df.npd.to_json(table=False) # save df as a JSON-value (format Table Schema if table is True else format NTV ) -df = npd.read_json(jsn) # load a JSON-value as a `DataFrame` +ArcticDB is a serverless DataFrame database engine designed for the Python Data Science ecosystem. +ArcticDB enables you to store, retrieve, and process pandas DataFrames at scale. +It is a storage engine designed for object storage and also supports local-disk storage using LMDB. +ArcticDB requires zero additional infrastructure beyond a running Python environment and access +to object storage and can be installed in seconds. -df.equals(npd.read_json(df.npd.to_json(df))) # `True` in any case, whether `table=True` or not -``` +Please find full documentation [here](https://docs.arcticdb.io/latest/). -### [BCPandas](https://github.com/yehoshuadimarsky/bcpandas) +#### [BCPandas](https://github.com/yehoshuadimarsky/bcpandas) BCPandas provides high performance writes from pandas to Microsoft SQL Server, far exceeding the performance of the native ``df.to_sql`` method. Internally, it uses Microsoft's BCP utility, but the complexity is fully abstracted away from the end user. Rigorously tested, it is a complete replacement for ``df.to_sql``. 
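As a rough sketch of what that replacement looks like in practice (based on BCPandas' documented interface; treat the import path, the `SqlCreds` arguments, and the keyword names as assumptions to verify against the project's README):

```python
import pandas as pd
from bcpandas import SqlCreds, to_sql  # assumed public API of bcpandas

# Placeholder connection details for a SQL Server instance.
creds = SqlCreds("my_server", "my_database", "my_username", "my_password")

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})

# Intended as a drop-in style replacement for df.to_sql, routed through the BCP utility.
to_sql(df, "my_table", creds, index=False, if_exists="replace")
```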
-### [Deltalake](https://pypi.org/project/deltalake) +#### [Deltalake](https://pypi.org/project/deltalake) Deltalake python package lets you access tables stored in [Delta Lake](https://delta.io/) natively in Python without the need to use Spark or JVM. It provides the ``delta_table.to_pyarrow_table().to_pandas()`` method to convert any Delta table into Pandas dataframe. -### [pandas-gbq](https://github.com/googleapis/python-bigquery-pandas) - -pandas-gbq provides high performance reads and writes to and from -[Google BigQuery](https://cloud.google.com/bigquery/). Previously (before version 2.2.0), -these methods were exposed as `pandas.read_gbq` and `DataFrame.to_gbq`. -Use `pandas_gbq.read_gbq` and `pandas_gbq.to_gbq`, instead. - - -### [ArcticDB](https://github.com/man-group/ArcticDB) +#### [fredapi](https://github.com/mortada/fredapi) -ArcticDB is a serverless DataFrame database engine designed for the Python Data Science ecosystem. ArcticDB enables you to store, retrieve, and process pandas DataFrames at scale. It is a storage engine designed for object storage and also supports local-disk storage using LMDB. ArcticDB requires zero additional infrastructure beyond a running Python environment and access to object storage and can be installed in seconds. Please find full documentation [here](https://docs.arcticdb.io/latest/). - -#### ArcticDB Terminology - -ArcticDB is structured to provide a scalable and efficient way to manage and retrieve DataFrames, organized into several key components: - -- `Object Store` Collections of libraries. Used to separate logical environments from each other. Analogous to a database server. -- `Library` Contains multiple symbols which are grouped in a certain way (different users, markets, etc). Analogous to a database. -- `Symbol` Atomic unit of data storage. Identified by a string name. Data stored under a symbol strongly resembles a pandas DataFrame. Analogous to tables. -- `Version` Every modifying action (write, append, update) performed on a symbol creates a new version of that object. +fredapi is a Python interface to the [Federal Reserve Economic Data +(FRED)](https://fred.stlouisfed.org/) provided by the Federal Reserve +Bank of St. Louis. It works with both the FRED database and ALFRED +database that contains point-in-time data (i.e. historic data +revisions). fredapi provides a wrapper in Python to the FRED HTTP API, +and also provides several convenient methods for parsing and analyzing +point-in-time data from ALFRED. fredapi makes use of pandas and returns +data in a Series or DataFrame. This module requires a FRED API key that +you can obtain for free on the FRED website. -#### Installation +#### [Hugging Face](https://huggingface.co/datasets) -To install, simply run: +The Hugging Face Dataset Hub provides a large collection of ready-to-use +datasets for machine learning shared by the community. The platform offers +a user-friendly interface to explore, discover and visualize datasets, and +provides tools to easily load and work with these datasets in Python thanks +to the [huggingface_hub](https://github.com/huggingface/huggingface_hub) library. -```console -pip install arcticdb -``` +You can access datasets on Hugging Face using `hf://` paths in pandas, +in the form `hf://datasets/username/dataset_name/...`. 
-To get started, we can import ArcticDB and instantiate it: +For example, here is how to load the +[stanfordnlp/imdb dataset](https://huggingface.co/datasets/stanfordnlp/imdb): ```python -import arcticdb as adb -import numpy as np import pandas as pd -# this will set up the storage using the local file system -arctic = adb.Arctic("lmdb://arcticdb_test") -``` -> **Note:** ArcticDB supports any S3 API compatible storage, including AWS. ArcticDB also supports Azure Blob storage. -> ArcticDB also supports LMDB for local/file based storage - to use LMDB, pass an LMDB path as the URI: `adb.Arctic('lmdb://path/to/desired/database')`. +# Load the IMDB dataset +df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet") +``` -#### Library Setup +Tip: on a dataset page, click on "Use this dataset" to get the code to load it in pandas. -ArcticDB is geared towards storing many (potentially millions) of tables. Individual tables (DataFrames) are called symbols and are stored in collections called libraries. A single library can store many symbols. Libraries must first be initialized prior to use: +To save a dataset on Hugging Face you need to +[create a public or private dataset](https://huggingface.co/new-dataset) and +[login](https://huggingface.co/docs/huggingface_hub/quick-start#login-command), +and then you can use `df.to_csv/to_json/to_parquet`: ```python -lib = arctic.get_library('sample', create_if_missing=True) +# Save the dataset to my Hugging Face account +df.to_parquet("hf://datasets/username/dataset_name/train.parquet") ``` -#### Writing Data to ArcticDB +You can find more information about the Hugging Face Dataset Hub in the [documentation](https://huggingface.co/docs/hub/en/datasets). -Now we have a library set up, we can get to reading and writing data. ArcticDB has a set of simple functions for DataFrame storage. Let's write a DataFrame to storage. +#### [NTV-pandas](https://github.com/loco-philippe/ntv-pandas) -```python -df = pd.DataFrame( - { - "a": list("abc"), - "b": list(range(1, 4)), - "c": np.arange(3, 6).astype("u1"), - "d": np.arange(4.0, 7.0, dtype="float64"), - "e": [True, False, True], - "f": pd.date_range("20130101", periods=3) - } -) - -df -df.dtypes -``` - -Write to ArcticDB. +NTV-pandas provides a JSON converter with more data types than the ones supported by pandas directly. -```python -write_record = lib.write("test", df) -``` +It supports the following data types: -> **Note:** When writing pandas DataFrames, ArcticDB supports the following index types: -> -> - `pandas.Index` containing int64 (or the corresponding dedicated types Int64Index, UInt64Index) -> - `RangeIndex` -> - `DatetimeIndex` -> - `MultiIndex` composed of above supported types -> -> The "row" concept in `head`/`tail` refers to the row number ('iloc'), not the value in the `pandas.Index` ('loc'). +- pandas data types +- data types defined in the [NTV format](https://loco-philippe.github.io/ES/JSON%20semantic%20format%20(JSON-NTV).htm) +- data types defined in [Table Schema specification](https://datapackage.org/standard/table-schema/) -#### Reading Data from ArcticDB +The interface is always reversible (conversion round trip) with two formats (JSON-NTV and JSON-TableSchema). -Read the data back from storage: +Example: ```python -read_record = lib.read("test") -read_record.data -df.dtypes -``` - -ArcticDB also supports appending, updating, and querying data from storage to a pandas DataFrame. 
Please find more information [here](https://docs.arcticdb.io/latest/api/query_builder/). +import ntv_pandas as npd -### [Hugging Face](https://huggingface.co/datasets) +jsn = df.npd.to_json(table=False) # save df as a JSON-value (format Table Schema if table is True else format NTV ) +df = npd.read_json(jsn) # load a JSON-value as a `DataFrame` -The Hugging Face Dataset Hub provides a large collection of ready-to-use datasets for machine learning shared by the community. The platform offers a user-friendly interface to explore, discover and visualize datasets, and provides tools to easily load and work with these datasets in Python thanks to the [huggingface_hub](https://github.com/huggingface/huggingface_hub) library. +df.equals(npd.read_json(df.npd.to_json(df))) # `True` in any case, whether `table=True` or not +``` -You can access datasets on Hugging Face using `hf://` paths in pandas, in the form `hf://datasets/username/dataset_name/...`. +#### [pandas-datareader](https://github.com/pydata/pandas-datareader) -For example, here is how to load the [stanfordnlp/imdb dataset](https://huggingface.co/datasets/stanfordnlp/imdb): +`pandas-datareader` is a remote data access library for pandas +(PyPI:`pandas-datareader`). It is based on functionality that was +located in `pandas.io.data` and `pandas.io.wb` but was split off in +v0.19. See more in the [pandas-datareader +docs](https://pandas-datareader.readthedocs.io/en/latest/): -```python -import pandas as pd +The following data feeds are available: -# Load the IMDB dataset -df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet") -``` +- Google Finance +- Tiingo +- Morningstar +- IEX +- Robinhood +- Enigma +- Quandl +- FRED +- Fama/French +- World Bank +- OECD +- Eurostat +- TSP Fund Data +- Nasdaq Trader Symbol Definitions +- Stooq Index Data +- MOEX Data -Tip: on a dataset page, click on "Use this dataset" to get the code to load it in pandas. +#### [pandas-gbq](https://github.com/googleapis/python-bigquery-pandas) -To save a dataset on Hugging Face you need to [create a public or private dataset](https://huggingface.co/new-dataset) and [login](https://huggingface.co/docs/huggingface_hub/quick-start#login-command), and then you can use `df.to_csv/to_json/to_parquet`: +pandas-gbq provides high performance reads and writes to and from +[Google BigQuery](https://cloud.google.com/bigquery/). Previously (before version 2.2.0), +these methods were exposed as `pandas.read_gbq` and `DataFrame.to_gbq`. +Use `pandas_gbq.read_gbq` and `pandas_gbq.to_gbq`, instead. -```python -# Save the dataset to my Hugging Face account -df.to_parquet("hf://datasets/username/dataset_name/train.parquet") -``` +#### [pandaSDMX](https://pandasdmx.readthedocs.io) -You can find more information about the Hugging Face Dataset Hub in the [documentation](https://huggingface.co/docs/hub/en/datasets). +pandaSDMX is a library to retrieve and acquire statistical data and +metadata disseminated in [SDMX](https://sdmx.org) 2.1, an +ISO-standard widely used by institutions such as statistics offices, +central banks, and international organisations. pandaSDMX can expose +datasets and related structural metadata including data flows, +code-lists, and data structure definitions as pandas Series or +MultiIndexed DataFrames. -## Out-of-core -### [Bodo](https://github.com/bodo-ai/Bodo) +## Scaling pandas +#### [Bodo](https://github.com/bodo-ai/Bodo) Bodo is a high-performance compute engine for Python data processing. 
Using an auto-parallelizing just-in-time (JIT) compiler, Bodo simplifies scaling Pandas @@ -525,59 +370,26 @@ def process_data(): process_data() ``` - -### [Cylon](https://cylondata.org/) - -Cylon is a fast, scalable, distributed memory parallel runtime with a pandas -like Python DataFrame API. ”Core Cylon” is implemented with C++ using Apache -Arrow format to represent the data in-memory. Cylon DataFrame API implements -most of the core operators of pandas such as merge, filter, join, concat, -group-by, drop_duplicates, etc. These operators are designed to work across -thousands of cores to scale applications. It can interoperate with pandas -DataFrame by reading data from pandas or converting data to pandas so users -can selectively scale parts of their pandas DataFrame applications. - -```python -from pycylon import read_csv, DataFrame, CylonEnv -from pycylon.net import MPIConfig - -# Initialize Cylon distributed environment -config: MPIConfig = MPIConfig() -env: CylonEnv = CylonEnv(config=config, distributed=True) - -df1: DataFrame = read_csv('/tmp/csv1.csv') -df2: DataFrame = read_csv('/tmp/csv2.csv') - -# Using 1000s of cores across the cluster to compute the join -df3: Table = df1.join(other=df2, on=[0], algorithm="hash", env=env) - -print(df3) -``` - -### [Dask](https://docs.dask.org) +#### [Dask](https://docs.dask.org) Dask is a flexible parallel computing library for analytics. Dask provides a familiar `DataFrame` interface for out-of-core, parallel and distributed computing. -### [Dask-ML](https://ml.dask.org) - -Dask-ML enables parallel and distributed machine learning using Dask -alongside existing machine learning libraries like Scikit-Learn, -XGBoost, and TensorFlow. - -### [Ibis](https://ibis-project.org/docs/) +#### [Ibis](https://ibis-project.org/docs/) -Ibis offers a standard way to write analytics code, that can be run in multiple engines. It helps in bridging the gap between local Python environments (like pandas) and remote storage and execution systems like Hadoop components (like HDFS, Impala, Hive, Spark) and SQL databases (Postgres, etc.). +Ibis offers a standard way to write analytics code, that can be run in +multiple engines. It helps in bridging the gap between local Python environments +(like pandas) and remote storage and execution systems like Hadoop components +(like HDFS, Impala, Hive, Spark) and SQL databases (Postgres, etc.). - -### [Koalas](https://koalas.readthedocs.io/en/latest/) +#### [Koalas](https://koalas.readthedocs.io/en/latest/) Koalas provides a familiar pandas DataFrame interface on top of Apache Spark. It enables users to leverage multi-cores on one machine or a cluster of machines to speed up or scale their DataFrame code. -### [Modin](https://github.com/modin-project/modin) +#### [Modin](https://github.com/modin-project/modin) The ``modin.pandas`` DataFrame is a parallel and distributed drop-in replacement for pandas. This means that you can use Modin with existing pandas code or write @@ -593,153 +405,222 @@ import modin.pandas as pd df = pd.read_csv("big.csv") # use all your cores! ``` -### [Pandarallel](https://github.com/nalepae/pandarallel) -Pandarallel provides a simple way to parallelize your pandas operations on all your CPUs by changing only one line of code. -It also displays progress bars. 
+## Data cleaning and validation for pandas -```python -from pandarallel import pandarallel +#### [Pandera](https://pandera.readthedocs.io/en/stable/) -pandarallel.initialize(progress_bar=True) +Pandera provides a flexible and expressive API for performing data validation on dataframes +to make data processing pipelines more readable and robust. +Dataframes contain information that pandera explicitly validates at runtime. This is useful in +production-critical data pipelines or reproducible research settings. -# df.apply(func) -df.parallel_apply(func) -``` +#### [pyjanitor](https://github.com/pyjanitor-devs/pyjanitor) -### [Vaex](https://vaex.io/docs/) +Pyjanitor provides a clean API for cleaning data, using method chaining. -Increasingly, packages are being built on top of pandas to address -specific needs in data preparation, analysis and visualization. Vaex is -a python library for Out-of-Core DataFrames (similar to Pandas), to -visualize and explore big tabular datasets. It can calculate statistics -such as mean, sum, count, standard deviation etc, on an N-dimensional -grid up to a billion (10^9) objects/rows per second. Visualization is -done using histograms, density plots and 3d volume rendering, allowing -interactive exploration of big data. Vaex uses memory mapping, zero -memory copy policy and lazy computations for best performance (no memory -wasted). -- ``vaex.from_pandas`` -- ``vaex.to_pandas_df`` +## Development tools for pandas -### [Hail Query](https://hail.is/) +#### [Hamilton](https://github.com/dagworks-inc/hamilton) -An out-of-core, preemptible-safe, distributed, dataframe library serving -the genetics community. Hail Query ships with on-disk data formats, -in-memory data formats, an expression compiler, a query planner, and a -distributed sort algorithm all designed to accelerate queries on large -matrices of genome sequencing data. +Hamilton is a declarative dataflow framework that came out of Stitch Fix. It was +designed to help one manage a Pandas code base, specifically with respect to +feature engineering for machine learning models. -It is often easiest to use pandas to manipulate the summary statistics or -other small aggregates produced by Hail. For this reason, Hail provides -native import to and export from pandas DataFrames: +It prescribes an opinionated paradigm, that ensures all code is: -- [`Table.from_pandas`](https://hail.is/docs/latest/hail.Table.html#hail.Table.from_pandas) -- [`Table.to_pandas`](https://hail.is/docs/latest/hail.Table.html#hail.Table.to_pandas) +* unit testable +* integration testing friendly +* documentation friendly +* transformation logic is reusable, as it is decoupled from the context of where it is used. +* integratable with runtime data quality checks. -## Data cleaning and validation +This helps one to scale your pandas code base, at the same time, keeping maintenance costs low. -### [pyjanitor](https://github.com/pyjanitor-devs/pyjanitor) +For more information, see [documentation](https://hamilton.readthedocs.io/). -Pyjanitor provides a clean API for cleaning data, using method chaining. +#### [IPython](https://ipython.org/documentation.html) -### [Pandera](https://pandera.readthedocs.io/en/stable/) +IPython is an interactive command shell and distributed computing +environment. IPython tab completion works with Pandas methods and also +attributes like DataFrame columns. -Pandera provides a flexible and expressive API for performing data validation on dataframes -to make data processing pipelines more readable and robust. 
-Dataframes contain information that pandera explicitly validates at runtime. This is useful in -production-critical data pipelines or reproducible research settings. +#### [Jupyter Notebook / Jupyter Lab](https://jupyter.org) -## Extension data types +Jupyter Notebook is a web application for creating Jupyter notebooks. A +Jupyter notebook is a JSON document containing an ordered list of +input/output cells which can contain code, text, mathematics, plots and +rich media. Jupyter notebooks can be converted to a number of open +standard output formats (HTML, HTML presentation slides, LaTeX, PDF, +ReStructuredText, Markdown, Python) through 'Download As' in the web +interface and `jupyter convert` in a shell. -Pandas provides an interface for defining -[extension types](https://pandas.pydata.org/docs/development/extending.html#extension-types) to extend NumPy's type system. -The following libraries implement that interface to provide types not found in NumPy or pandas, -which work well with pandas' data containers. +Pandas DataFrames implement `_repr_html_` and `_repr_latex` methods which +are utilized by Jupyter Notebook for displaying (abbreviated) HTML or +LaTeX tables. LaTeX output is properly escaped. (Note: HTML tables may +or may not be compatible with non-HTML Jupyter output formats.) -### [awkward-pandas](https://awkward-pandas.readthedocs.io/) +See [Options and Settings](https://pandas.pydata.org/docs/user_guide/options.html) +for pandas `display.` settings. -Awkward-pandas provides an extension type for storing [Awkward -Arrays](https://awkward-array.org/) inside pandas' Series and -DataFrame. It also provides an accessor for using awkward functions -on Series that are of awkward type. +#### [marimo](https://marimo.io) -### [db-dtypes](https://github.com/googleapis/python-db-dtypes-pandas) +marimo is a reactive notebook for Python and SQL that enhances productivity +when working with dataframes. It provides several features to make data +manipulation and visualization more interactive and fun: -db-dtypes provides an extension types for working with types like -DATE, TIME, and JSON from database systems. This package is used -by pandas-gbq to provide natural dtypes for BigQuery data types without -a natural numpy type. +1. Rich, interactive displays: marimo can display pandas dataframes in + interactive tables or charts with filtering and sorting capabilities. +2. Data selection: Users can select data in tables or pandas-backed plots, + and the selections are automatically sent to Python as pandas dataframes. +3. No-code transformations: Users can interactively transform pandas dataframes + using a GUI, without writing code. The generated code can be copied and pasted into the notebook. +4. Custom filters: marimo allows the creation of pandas-backed filters using + UI elements like sliders and dropdowns. +5. Dataset explorer: marimo automatically discovers and displays all dataframes + in the notebook, allowing users to explore and visualize data interactively. +6. SQL integration: marimo allows users to write SQL queries against any + pandas dataframes existing in memory. -### [Pandas-Genomics](https://pandas-genomics.readthedocs.io/en/latest/) +#### [pandas-stubs](https://github.com/VirtusLab/pandas-stubs) -Pandas-Genomics provides an extension type and extension array for working - with genomics data. It also includes `genomics` accessors for many useful properties - and methods related to QC and analysis of genomics data. 
+While pandas repository is partially typed, the package itself doesn't expose this information for external use. +Install pandas-stubs to enable basic type coverage of pandas API. -### [Physipandas](https://github.com/mocquin/physipandas) +Learn more by reading through these issues [14468](https://github.com/pandas-dev/pandas/issues/14468), +[26766](https://github.com/pandas-dev/pandas/issues/26766), [28142](https://github.com/pandas-dev/pandas/issues/28142). -Physipandas provides an extension for manipulating physical quantities - (like scalar and numpy.ndarray) in association with a physical unit - (like meter or joule) and additional features for integration of - `physipy` accessors with pandas Series and Dataframe. +See installation and usage instructions on the [GitHub page](https://github.com/VirtusLab/pandas-stubs). -### [Pint-Pandas](https://github.com/hgrecco/pint-pandas) +#### [Spyder](https://www.spyder-ide.org/) -Pint-Pandas provides an extension type for storing numeric arrays with units. -These arrays can be stored inside pandas' Series and DataFrame. Operations -between Series and DataFrame columns which use pint's extension array are then -units aware. +Spyder is a cross-platform PyQt-based IDE combining the editing, +analysis, debugging and profiling functionality of a software +development tool with the data exploration, interactive execution, deep +inspection and rich visualization capabilities of a scientific +environment like MATLAB or Rstudio. -### [Text Extensions](https://ibm.biz/text-extensions-for-pandas) +Its [Variable +Explorer](https://docs.spyder-ide.org/current/panes/variableexplorer.html) allows +users to view, manipulate and edit pandas `Index`, `Series`, and +`DataFrame` objects like a "spreadsheet", including copying and +modifying values, sorting, displaying a "heatmap", converting data +types and more. Pandas objects can also be renamed, duplicated, new +columns added, copied/pasted to/from the clipboard (as TSV), and +saved/loaded to/from a file. Spyder can also import data from a variety +of plain text and binary files or the clipboard into a new pandas +DataFrame via a sophisticated import wizard. -Text Extensions for Pandas provides extension types to cover common data structures for representing natural language data, plus library integrations that convert the outputs of popular natural language processing libraries into pandas DataFrames. +Most pandas classes, methods and data attributes can be autocompleted in +Spyder's [Editor](https://docs.spyder-ide.org/current/panes/editor.html) and [IPython +Console](https://docs.spyder-ide.org/current/panes/ipythonconsole.html), and Spyder's +[Help pane](https://docs.spyder-ide.org/current/panes/help.html) can retrieve and +render Numpydoc documentation on pandas objects in rich text with Sphinx +both automatically and on-demand. -## Accessors -A directory of projects providing -[extension accessors](https://pandas.pydata.org/docs/development/extending.html#registering-custom-accessors). -This is for users to discover new accessors and for library -authors to coordinate on the namespace. 
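As a sketch of what the pandas-stubs entry above provides, the function below is hypothetical, but once the stubs are installed a type checker such as mypy can verify annotations like these:

```python
import pandas as pd


def column_means(df: pd.DataFrame) -> pd.Series:
    """Return the mean of each numeric column."""
    return df.mean(numeric_only=True)


scores = pd.DataFrame({"math": [90, 80], "physics": [85, 95]})
print(column_means(scores))
```

With pandas-stubs installed, running mypy on this file can confirm that `DataFrame.mean` really returns a `Series`, information that the pandas package alone does not expose to the checker.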
+## Other related libraries - | Library | Accessor | Classes | - | -------------------------------------------------------------------- | ---------- | --------------------- | - | [awkward-pandas](https://awkward-pandas.readthedocs.io/en/latest/) | `ak` | `Series` | - | [pdvega](https://altair-viz.github.io/pdvega/) | `vgplot` | `Series`, `DataFrame` | - | [pandas-genomics](https://pandas-genomics.readthedocs.io/en/latest/) | `genomics` | `Series`, `DataFrame` | - | [pint-pandas](https://github.com/hgrecco/pint-pandas) | `pint` | `Series`, `DataFrame` | - | [physipandas](https://github.com/mocquin/physipandas) | `physipy` | `Series`, `DataFrame` | - | [composeml](https://github.com/alteryx/compose) | `slice` | `DataFrame` | - | [gurobipy-pandas](https://github.com/Gurobi/gurobipy-pandas) | `gppd` | `Series`, `DataFrame` | - | [staircase](https://www.staircase.dev/) | `sc` | `Series`, `DataFrame` | - | [woodwork](https://github.com/alteryx/woodwork) | `slice` | `Series`, `DataFrame` | +#### [Compose](https://github.com/alteryx/compose) -## Development tools +Compose is a machine learning tool for labeling data and prediction engineering. +It allows you to structure the labeling process by parameterizing +prediction problems and transforming time-driven relational data into +target values with cutoff times that can be used for supervised learning. -### [pandas-stubs](https://github.com/VirtusLab/pandas-stubs) +#### [D-Tale](https://github.com/man-group/dtale) -While pandas repository is partially typed, the package itself doesn't expose this information for external use. -Install pandas-stubs to enable basic type coverage of pandas API. +D-Tale is a lightweight web client for visualizing pandas data structures. It +provides a rich spreadsheet-style grid which acts as a wrapper for a lot of +pandas functionality (query, sort, describe, corr...) so users can quickly +manipulate their data. There is also an interactive chart-builder using Plotly +Dash allowing users to build nice portable visualizations. D-Tale can be +invoked with the following command -Learn more by reading through these issues [14468](https://github.com/pandas-dev/pandas/issues/14468), -[26766](https://github.com/pandas-dev/pandas/issues/26766), [28142](https://github.com/pandas-dev/pandas/issues/28142). +```python +import dtale -See installation and usage instructions on the [GitHub page](https://github.com/VirtusLab/pandas-stubs). +dtale.show(df) +``` -### [Hamilton](https://github.com/dagworks-inc/hamilton) +D-Tale integrates seamlessly with Jupyter notebooks, Python terminals, Kaggle +& Google Colab. Here are some demos of the +[grid](http://alphatechadmin.pythonanywhere.com/dtale/main/1). -Hamilton is a declarative dataflow framework that came out of Stitch Fix. It was designed to help one manage a -Pandas code base, specifically with respect to feature engineering for machine learning models. +#### [Featuretools](https://github.com/alteryx/featuretools/) -It prescribes an opinionated paradigm, that ensures all code is: +Featuretools is a Python library for automated feature engineering built +on top of pandas. It excels at transforming temporal and relational +datasets into feature matrices for machine learning using reusable +feature engineering "primitives". Users can contribute their own +primitives in Python and share them with the rest of the community. 
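A rough sketch of the Featuretools workflow described above, using its documented `EntitySet`/`dfs` API; the toy table and its column names are invented for illustration only:

```python
import featuretools as ft
import pandas as pd

# A tiny, fabricated transactions table.
transactions = pd.DataFrame(
    {
        "transaction_id": [1, 2, 3, 4],
        "customer_id": [1, 1, 2, 2],
        "amount": [10.0, 25.0, 5.0, 50.0],
    }
)

# Register the table, then split customers out so aggregation primitives apply.
es = ft.EntitySet(id="shop")
es.add_dataframe(dataframe_name="transactions", dataframe=transactions,
                 index="transaction_id")
es.normalize_dataframe(base_dataframe_name="transactions",
                       new_dataframe_name="customers", index="customer_id")

# Deep feature synthesis stacks reusable primitives (mean, sum, count, ...) for us.
feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name="customers")
print(feature_matrix.head())
```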
-* unit testable -* integration testing friendly -* documentation friendly -* transformation logic is reusable, as it is decoupled from the context of where it is used. -* integratable with runtime data quality checks. +#### [IPython Vega](https://github.com/vega/ipyvega) -This helps one to scale your pandas code base, at the same time, keeping maintenance costs low. +[IPython Vega](https://github.com/vega/ipyvega) leverages +[Vega](https://github.com/vega/vega) to create plots within Jupyter Notebook. -For more information, see [documentation](https://hamilton.readthedocs.io/). +#### [plotnine](https://github.com/has2k1/plotnine/) + +Hadley Wickham's [ggplot2](https://ggplot2.tidyverse.org/) is a +foundational exploratory visualization package for the R language. Based +on ["The Grammar of +Graphics"](https://doi.org/10.1007/0-387-28695-0) +it provides a powerful, declarative and extremely general way to +generate bespoke plots of any kind of data. +Various implementations to other languages are available. +A good implementation for Python users is [has2k1/plotnine](https://github.com/has2k1/plotnine/). + +#### [pygwalker](https://github.com/Kanaries/pygwalker) + +PyGWalker is an interactive data visualization and +exploratory data analysis tool built upon Graphic Walker +with support for visualization, cleaning, and annotation workflows. + +pygwalker can save interactively created charts +to Graphic-Walker and Vega-Lite JSON. + +``` +import pygwalker as pyg +pyg.walk(df) +``` + +#### [seaborn](https://seaborn.pydata.org) + +Seaborn is a Python visualization library based on +[matplotlib](https://matplotlib.org). It provides a high-level, +dataset-oriented interface for creating attractive statistical graphics. +The plotting functions in seaborn understand pandas objects and leverage +pandas grouping operations internally to support concise specification +of complex visualizations. Seaborn also goes beyond matplotlib and +pandas with the option to perform statistical estimation while plotting, +aggregating across observations and visualizing the fit of statistical +models to emphasize patterns in a dataset. + +``` +import seaborn as sns +sns.set_theme() +``` + +#### [skrub](https://skrub-data.org) + +Skrub facilitates machine learning on dataframes. It bridges pandas +to scikit-learn and related. In particular it facilitates building +features from dataframes. + +#### [Statsmodels](https://www.statsmodels.org/) + +Statsmodels is the prominent Python "statistics and econometrics +library" and it has a long-standing special relationship with pandas. +Statsmodels provides powerful statistics, econometrics, analysis and +modeling functionality that is out of pandas' scope. Statsmodels +leverages pandas objects as the underlying data container for +computation. + +#### [STUMPY](https://github.com/TDAmeritrade/stumpy) + +STUMPY is a powerful and scalable Python library for modern time series analysis. +At its core, STUMPY efficiently computes something called a +[matrix profile](https://stumpy.readthedocs.io/en/latest/Tutorial_The_Matrix_Profile.html), +which can be used for a wide variety of time series data mining tasks. 
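As a small illustration of how Statsmodels (listed above) uses pandas objects as its data container, here is a minimal fit on fabricated data; the column names and values are made up:

```python
import pandas as pd
import statsmodels.formula.api as smf

# A tiny, fabricated dataset: y is roughly 2 * x.
df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2.1, 3.9, 6.2, 8.1, 9.8]})

# The formula interface reads column names straight from the DataFrame.
model = smf.ols("y ~ x", data=df).fit()
print(model.params)    # intercept and slope, returned as a pandas Series
print(model.summary())
```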
diff --git a/web/pandas/config.yml b/web/pandas/config.yml index 679778330b68d..3ff22d574f171 100644 --- a/web/pandas/config.yml +++ b/web/pandas/config.yml @@ -14,7 +14,6 @@ main: - pandas_web.Preprocessors.home_add_releases - pandas_web.Preprocessors.roadmap_pdeps markdown_extensions: - - toc - tables - fenced_code - meta @@ -82,15 +81,11 @@ maintainers: - simonjayhawkins - topper-123 - alimcmaster1 - - bashtage - Dr-Irv - rhshadrach - phofl - attack68 - fangchenli - - lithomas1 - - lukemanley - - noatamir inactive: - lodagro - jseabold @@ -108,6 +103,10 @@ maintainers: - mzeitlin11 - twoertwein - MarcoGorelli + - bashtage + - noatamir + - lithomas1 + - lukemanley workgroups: coc: name: Code of Conduct @@ -146,11 +145,6 @@ sponsors: url: https://numfocus.org/ logo: static/img/partners/numfocus.svg kind: numfocus - - name: "Coiled" - url: https://www.coiled.io - logo: static/img/partners/coiled.svg - kind: partner - description: "Patrick Hoefler" - name: "Nvidia" url: https://www.nvidia.com logo: static/img/partners/nvidia.svg @@ -192,5 +186,20 @@ sponsors: - name: "d-fine GmbH" url: https://www.d-fine.com/en/ kind: partner + - name: "Two Sigma" + url: https://www.twosigma.com/ + kind: partner + - name: "Voltron Data" + url: https://voltrondata.com/ + kind: partner + - name: "Intel" + url: https://www.intel.com/ + kind: partner + - name: "Chan Zuckerberg Initiative" + url: https://chanzuckerberg.com/ + kind: regular + - name: "Coiled" + url: https://www.coiled.io + kind: partner roadmap: pdeps_path: pdeps diff --git a/web/pandas/index.html b/web/pandas/index.html index bbd8632e06840..c520a16b8160f 100644 --- a/web/pandas/index.html +++ b/web/pandas/index.html @@ -96,6 +96,11 @@

Recommended books

Python for Data Analysis

+ Pandas Cookbook, Third Edition

Effective pandas 2 diff --git a/web/pandas/pdeps/0001-purpose-and-guidelines.md b/web/pandas/pdeps/0001-purpose-and-guidelines.md index 7f5f0326eba6c..ed084a730ecdc 100644 --- a/web/pandas/pdeps/0001-purpose-and-guidelines.md +++ b/web/pandas/pdeps/0001-purpose-and-guidelines.md @@ -8,6 +8,8 @@ [Noa Tamir](https://github.com/noatamir) - Revision: 3 +[TOC] + ## PDEP definition, purpose and scope A PDEP (pandas enhancement proposal) is a proposal for a **major** change in diff --git a/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md b/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md index 68c6dfa26d1f1..b3f277326319e 100644 --- a/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md +++ b/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md @@ -6,6 +6,8 @@ - Author: [Marco Gorelli](https://github.com/MarcoGorelli) - Revision: 2 +[TOC] + ## Abstract The suggestion is that: diff --git a/web/pandas/pdeps/0005-no-default-index-mode.md b/web/pandas/pdeps/0005-no-default-index-mode.md index d543a4718e896..81222b51817d5 100644 --- a/web/pandas/pdeps/0005-no-default-index-mode.md +++ b/web/pandas/pdeps/0005-no-default-index-mode.md @@ -6,6 +6,8 @@ - Author: [Marco Gorelli](https://github.com/MarcoGorelli) - Revision: 2 +[TOC] + ## Abstract The suggestion is to add a ``NoRowIndex`` class. Internally, it would act a bit like diff --git a/web/pandas/pdeps/0006-ban-upcasting.md b/web/pandas/pdeps/0006-ban-upcasting.md index ae5872186bf23..59f2cc35bf6ee 100644 --- a/web/pandas/pdeps/0006-ban-upcasting.md +++ b/web/pandas/pdeps/0006-ban-upcasting.md @@ -6,6 +6,8 @@ - Author: [Marco Gorelli](https://github.com/MarcoGorelli) ([original issue](https://github.com/pandas-dev/pandas/issues/39584) by [Joris Van den Bossche](https://github.com/jorisvandenbossche)) - Revision: 1 +[TOC] + ## Abstract The suggestion is that setitem-like operations would diff --git a/web/pandas/pdeps/0007-copy-on-write.md b/web/pandas/pdeps/0007-copy-on-write.md index f5adb6a571120..5e35cf01de977 100644 --- a/web/pandas/pdeps/0007-copy-on-write.md +++ b/web/pandas/pdeps/0007-copy-on-write.md @@ -6,6 +6,8 @@ - Author: [Joris Van den Bossche](https://github.com/jorisvandenbossche) - Revision: 1 +[TOC] + ## Abstract Short summary of the proposal: @@ -525,7 +527,7 @@ following cases: * Selecting a single column (as a Series) out of a DataFrame is always a view (``df['a']``) * Slicing columns from a DataFrame creating a subset DataFrame (``df[['a':'b']]`` or - ``df.loc[:, 'a': 'b']``) is a view _if_ the the original DataFrame consists of a + ``df.loc[:, 'a': 'b']``) is a view _if_ the original DataFrame consists of a single block (single dtype, consolidated) and _if_ you are slicing (so not a list selection). In all other cases, getting a subset is always a copy. * Selecting rows _can_ return a view, when the row indexer is a `slice` object. 
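The view-versus-copy rules quoted from PDEP-7 above can be checked directly. The snippet below is an illustrative sketch only; it reflects the legacy behaviour the proposal describes, and results can differ depending on the pandas version and Copy-on-Write settings.

```python
import numpy as np
import pandas as pd

# A single dtype keeps the frame in one consolidated block.
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
base = df["a"].to_numpy()

single_col = df["a"]             # selecting one column: always a view
col_slice = df.loc[:, "a":"b"]   # column slice of a single-block frame: a view
row_slice = df[0:2]              # row slice: can be a view
list_pick = df.loc[[0, 2]]       # list-based selection: always a copy

# np.shares_memory shows whether each result still references df's data.
for name, obj in [("single_col", single_col), ("col_slice", col_slice),
                  ("row_slice", row_slice), ("list_pick", list_pick)]:
    values = obj.to_numpy() if isinstance(obj, pd.Series) else obj["a"].to_numpy()
    print(name, np.shares_memory(values, base))
```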
diff --git a/web/pandas/pdeps/0009-io-extensions.md b/web/pandas/pdeps/0009-io-extensions.md index aeda990cea7df..baa661957e951 100644 --- a/web/pandas/pdeps/0009-io-extensions.md +++ b/web/pandas/pdeps/0009-io-extensions.md @@ -7,6 +7,8 @@ - Author: [Marc Garcia](https://github.com/datapythonista) - Revision: 1 +[TOC] + ## PDEP Summary This document proposes that third-party projects implementing I/O or memory diff --git a/web/pandas/pdeps/0010-required-pyarrow-dependency.md b/web/pandas/pdeps/0010-required-pyarrow-dependency.md index 0c3bf3c776988..60ed8c4b910eb 100644 --- a/web/pandas/pdeps/0010-required-pyarrow-dependency.md +++ b/web/pandas/pdeps/0010-required-pyarrow-dependency.md @@ -8,6 +8,8 @@ [Patrick Hoefler](https://github.com/phofl) - Revision: 1 +[TOC] + ## Abstract This PDEP proposes that: diff --git a/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md b/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md index 71f669825f979..1c513c3bb517b 100644 --- a/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md +++ b/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md @@ -8,36 +8,7 @@ - Author: [Philippe THOMY](https://github.com/loco-philippe) - Revision: 3 -##### Summary - -- [Abstract](./0012-compact-and-reversible-JSON-interface.md/#Abstract) - - [Problem description](./0012-compact-and-reversible-JSON-interface.md/#Problem-description) - - [Feature Description](./0012-compact-and-reversible-JSON-interface.md/#Feature-Description) -- [Scope](./0012-compact-and-reversible-JSON-interface.md/#Scope) -- [Motivation](./0012-compact-and-reversible-JSON-interface.md/#Motivation) - - [Why is it important to have a compact and reversible JSON interface ?](./0012-compact-and-reversible-JSON-interface.md/#Why-is-it-important-to-have-a-compact-and-reversible-JSON-interface-?) - - [Is it relevant to take an extended type into account ?](./0012-compact-and-reversible-JSON-interface.md/#Is-it-relevant-to-take-an-extended-type-into-account-?) - - [Is this only useful for pandas ?](./0012-compact-and-reversible-JSON-interface.md/#Is-this-only-useful-for-pandas-?) -- [Description](./0012-compact-and-reversible-JSON-interface.md/#Description) - - [Data typing](./0012-compact-and-reversible-JSON-interface.md/#Data-typing) - - [Correspondence between TableSchema and pandas](./panda0012-compact-and-reversible-JSON-interfaces_PDEP.md/#Correspondence-between-TableSchema-and-pandas) - - [JSON format](./0012-compact-and-reversible-JSON-interface.md/#JSON-format) - - [Conversion](./0012-compact-and-reversible-JSON-interface.md/#Conversion) -- [Usage and impact](./0012-compact-and-reversible-JSON-interface.md/#Usage-and-impact) - - [Usage](./0012-compact-and-reversible-JSON-interface.md/#Usage) - - [Compatibility](./0012-compact-and-reversible-JSON-interface.md/#Compatibility) - - [Impacts on the pandas framework](./0012-compact-and-reversible-JSON-interface.md/#Impacts-on-the-pandas-framework) - - [Risk to do / risk not to do](./0012-compact-and-reversible-JSON-interface.md/#Risk-to-do-/-risk-not-to-do) -- [Implementation](./0012-compact-and-reversible-JSON-interface.md/#Implementation) - - [Modules](./0012-compact-and-reversible-JSON-interface.md/#Modules) - - [Implementation options](./0012-compact-and-reversible-JSON-interface.md/#Implementation-options) -- [F.A.Q.](./0012-compact-and-reversible-JSON-interface.md/#F.A.Q.) 
-- [Synthesis](./0012-compact-and-reversible-JSON-interface.md/Synthesis) -- [Core team decision](./0012-compact-and-reversible-JSON-interface.md/#Core-team-decision) -- [Timeline](./0012-compact-and-reversible-JSON-interface.md/#Timeline) -- [PDEP history](./0012-compact-and-reversible-JSON-interface.md/#PDEP-history) - -------------------------- +[TOC] ## Abstract diff --git a/web/pandas/pdeps/0014-string-dtype.md b/web/pandas/pdeps/0014-string-dtype.md index 5b74f71216454..35b5725341534 100644 --- a/web/pandas/pdeps/0014-string-dtype.md +++ b/web/pandas/pdeps/0014-string-dtype.md @@ -220,8 +220,8 @@ in pandas 2.3 and removed in pandas 3.0. The `storage` keyword of `StringDtype` is kept to disambiguate the underlying storage of the string data (using pyarrow or python objects), but an additional -`na_value` is introduced to disambiguate the the variants using NA semantics -and NaN semantics. +`na_value` is introduced to disambiguate the variants using NA semantics and +NaN semantics. Overview of the different ways to specify a dtype and the resulting concrete dtype of the data: diff --git a/web/pandas/static/css/pandas.css b/web/pandas/static/css/pandas.css index ec9a4bd502dd1..59904606040be 100644 --- a/web/pandas/static/css/pandas.css +++ b/web/pandas/static/css/pandas.css @@ -1,3 +1,6 @@ +html { + scroll-padding-top: 5rem; +} body { padding-top: 5em; color: #444; @@ -11,6 +14,7 @@ h2 { font-size: 1.8rem; font-weight: 700; color: #130654; + margin: 2.4rem 0 1.2rem; } h3 { font-size: 1.3rem; @@ -21,6 +25,15 @@ h3 a { color: black; text-decoration: underline dotted !important; } +h4 { + font-size: 1.1rem; + font-weight: 600; + color: black; +} +h4 a { + color: black; + text-decoration: underline dotted !important; +} a { color: #130654; } @@ -103,3 +116,22 @@ blockquote { color: #787878; font-size: 18px; } +.toc { + background: #f9f9f9; + padding: 1em; + border: 0.1em solid darkgrey; + border-radius: 0.4em; + display: inline-block; + margin: 1em 0; +} +.toc .toctitle { + font-weight: bold; + padding-bottom: 1em; +} +a.headerlink { + opacity: 0; +} +h2:hover a.headerlink, h3:hover a.headerlink { + opacity: 1; + transition: opacity 0.5s; +} diff --git a/web/pandas/static/img/books/pandas_cookbook_3.gif b/web/pandas/static/img/books/pandas_cookbook_3.gif new file mode 100644 index 0000000000000..aa9d351d489e0 Binary files /dev/null and b/web/pandas/static/img/books/pandas_cookbook_3.gif differ diff --git a/web/pandas/static/img/partners/coiled.svg b/web/pandas/static/img/partners/coiled.svg deleted file mode 100644 index 2d76ce150084b..0000000000000 --- a/web/pandas/static/img/partners/coiled.svg +++ /dev/null @@ -1,234 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/web/pandas/versions.json b/web/pandas/versions.json index 2d2599ae8585b..1274d2125bb2b 100644 --- a/web/pandas/versions.json +++ b/web/pandas/versions.json @@ -5,11 +5,16 @@ "url": "https://pandas.pydata.org/docs/dev/" }, { - "name": "2.2 (stable)", - "version": "2.2", + "name": "2.3 (stable)", + "version": "2.3", "url": "https://pandas.pydata.org/docs/", "preferred": true }, + { + "name": "2.2", + "version": "2.2", + "url": "https://pandas.pydata.org/pandas-docs/version/2.2/" + }, { "name": "2.1", "version": "2.1", diff --git a/web/pandas_web.py b/web/pandas_web.py index b3872b829c73a..dec03d00574f1 100755 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -42,6 +42,7 @@ import feedparser 
import jinja2 import markdown +from markdown.extensions.toc import TocExtension from packaging import version import requests import yaml @@ -100,20 +101,15 @@ def blog_add_posts(context): posts = [] # posts from the file system if context["blog"]["posts_path"]: - posts_path = os.path.join( - context["source_path"], *context["blog"]["posts_path"].split("/") - ) - for fname in os.listdir(posts_path): - if fname.startswith("index."): + posts_path = context["source_path"] / context["blog"]["posts_path"] + for fname in posts_path.iterdir(): + if fname.name.startswith("index."): continue - link = ( - f"/{context['blog']['posts_path']}" - f"/{os.path.splitext(fname)[0]}.html" - ) + link = f"/{context['blog']['posts_path']}/{fname.stem}.html" md = markdown.Markdown( extensions=context["main"]["markdown_extensions"] ) - with open(os.path.join(posts_path, fname), encoding="utf-8") as f: + with fname.open(encoding="utf-8") as f: html = md.convert(f.read()) title = md.Meta["title"][0] summary = re.sub(tag_expr, "", html) @@ -386,15 +382,15 @@ def get_callable(obj_as_str: str) -> object: return obj -def get_context(config_fname: str, **kwargs): +def get_context(config_fname: pathlib.Path, **kwargs): """ Load the config yaml as the base context, and enrich it with the information added by the context preprocessors defined in the file. """ - with open(config_fname, encoding="utf-8") as f: + with config_fname.open(encoding="utf-8") as f: context = yaml.safe_load(f) - context["source_path"] = os.path.dirname(config_fname) + context["source_path"] = config_fname.parent context.update(kwargs) preprocessors = ( @@ -409,14 +405,13 @@ def get_context(config_fname: str, **kwargs): return context -def get_source_files(source_path: str) -> typing.Generator[str, None, None]: +def get_source_files(source_path: pathlib.Path) -> typing.Generator[str, None, None]: """ Generate the list of files present in the source directory. """ - for root, dirs, fnames in os.walk(source_path): - root_rel_path = os.path.relpath(root, source_path) - for fname in fnames: - yield os.path.join(root_rel_path, fname) + for path in source_path.rglob("*"): + if path.is_file(): + yield path.relative_to(source_path) def extend_base_template(content: str, base_template: str) -> str: @@ -432,8 +427,8 @@ def extend_base_template(content: str, base_template: str) -> str: def main( - source_path: str, - target_path: str, + source_path: pathlib.Path, + target_path: pathlib.Path, ) -> int: """ Copy every file in the source directory to the target directory. @@ -441,7 +436,18 @@ def main( For ``.md`` and ``.html`` files, render them with the context before copying them. ``.md`` files are transformed to HTML. """ - config_fname = os.path.join(source_path, "config.yml") + + # Sanity check: validate that versions.json is valid JSON + versions_path = source_path / "versions.json" + with versions_path.open(encoding="utf-8") as f: + try: + json.load(f) + except json.JSONDecodeError as e: + raise RuntimeError( + f"Invalid versions.json: {e}. Ensure it is valid JSON." 
+ ) from e + + config_fname = source_path / "config.yml" shutil.rmtree(target_path, ignore_errors=True) os.makedirs(target_path, exist_ok=True) @@ -450,40 +456,40 @@ def main( context = get_context(config_fname, target_path=target_path) sys.stderr.write("Context generated\n") - templates_path = os.path.join(source_path, context["main"]["templates_path"]) + templates_path = source_path / context["main"]["templates_path"] jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_path)) for fname in get_source_files(source_path): - if os.path.normpath(fname) in context["main"]["ignore"]: + if fname.as_posix() in context["main"]["ignore"]: continue - sys.stderr.write(f"Processing {fname}\n") - dirname = os.path.dirname(fname) - os.makedirs(os.path.join(target_path, dirname), exist_ok=True) + dirname = fname.parent + (target_path / dirname).mkdir(parents=True, exist_ok=True) - extension = os.path.splitext(fname)[-1] + extension = fname.suffix if extension in (".html", ".md"): - with open(os.path.join(source_path, fname), encoding="utf-8") as f: + with (source_path / fname).open(encoding="utf-8") as f: content = f.read() if extension == ".md": + toc = TocExtension( + title="Table of Contents", + toc_depth="2-3", + permalink=" #", + ) body = markdown.markdown( - content, extensions=context["main"]["markdown_extensions"] + content, extensions=context["main"]["markdown_extensions"] + [toc] ) # Apply Bootstrap's table formatting manually # Python-Markdown doesn't let us config table attributes by hand body = body.replace("
<table>", '<table class="table table-bordered">
') content = extend_base_template(body, context["main"]["base_template"]) - context["base_url"] = "".join(["../"] * os.path.normpath(fname).count("/")) + context["base_url"] = "../" * (len(fname.parents) - 1) content = jinja_env.from_string(content).render(**context) - fname_html = os.path.splitext(fname)[0] + ".html" - with open( - os.path.join(target_path, fname_html), "w", encoding="utf-8" - ) as f: + fname_html = fname.with_suffix(".html").name + with (target_path / dirname / fname_html).open("w", encoding="utf-8") as f: f.write(content) else: - shutil.copy( - os.path.join(source_path, fname), os.path.join(target_path, dirname) - ) + shutil.copy(source_path / fname, target_path / fname) if __name__ == "__main__": @@ -495,4 +501,4 @@ def main( "--target-path", default="build", help="directory where to write the output" ) args = parser.parse_args() - sys.exit(main(args.source_path, args.target_path)) + sys.exit(main(pathlib.Path(args.source_path), pathlib.Path(args.target_path)))