diff --git a/.gitattributes b/.gitattributes
index d94c19e7edb1f..bc7dec642df0f 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -84,5 +84,3 @@ pandas/tests/io/parser/data export-ignore
# Include cibw script in sdist since it's needed for building wheels
scripts/cibw_before_build.sh -export-ignore
-scripts/cibw_before_build_windows.sh -export-ignore
-scripts/cibw_before_test_windows.sh -export-ignore
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index e430681225cd9..3a7c71af02bf9 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -9,7 +9,6 @@ doc/cheatsheet @Dr-Irv
doc/source/development @noatamir
# pandas
-pandas/_libs/ @WillAyd
pandas/_typing.py @Dr-Irv
pandas/core/groupby/* @rhshadrach
pandas/io/excel/* @rhshadrach
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yaml b/.github/ISSUE_TEMPLATE/feature_request.yaml
index 6e6cd78ace11d..9c15218794499 100644
--- a/.github/ISSUE_TEMPLATE/feature_request.yaml
+++ b/.github/ISSUE_TEMPLATE/feature_request.yaml
@@ -31,7 +31,7 @@ body:
attributes:
label: Feature Description
description: >
- Please describe how the new feature would be implemented, using psudocode if relevant.
+ Please describe how the new feature would be implemented, using pseudocode if relevant.
placeholder: >
Add a new parameter to DataFrame, to_series, to return a Series if possible.
diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml
index b92bacd1a537c..2d208cb38725a 100644
--- a/.github/actions/build_pandas/action.yml
+++ b/.github/actions/build_pandas/action.yml
@@ -4,6 +4,9 @@ inputs:
editable:
description: Whether to build pandas in editable mode (default true)
default: true
+ werror:
+ description: Whether to enable the --werror build flag (default true)
+ default: true
runs:
using: composite
steps:
@@ -26,9 +29,9 @@ runs:
run: |
if [[ ${{ inputs.editable }} == "true" ]]; then
pip install -e . --no-build-isolation -v --no-deps \
- -Csetup-args="--werror"
+ ${{ inputs.werror == 'true' && '-Csetup-args="--werror"' || '' }}
else
pip install . --no-build-isolation -v --no-deps \
- -Csetup-args="--werror"
+ ${{ inputs.werror == 'true' && '-Csetup-args="--werror"' || '' }}
fi
shell: bash -el {0}
diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml
index e1d2d1ea846b8..728019b06e053 100644
--- a/.github/workflows/code-checks.yml
+++ b/.github/workflows/code-checks.yml
@@ -21,7 +21,7 @@ permissions:
jobs:
docstring_typing_manual_hooks:
name: Docstring validation, typing, and other manual pre-commit hooks
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
defaults:
run:
shell: bash -el {0}
@@ -102,7 +102,7 @@ jobs:
asv-benchmarks:
name: ASV Benchmarks
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
defaults:
run:
shell: bash -el {0}
@@ -133,7 +133,7 @@ jobs:
build_docker_dev_environment:
name: Build Docker Dev Environment
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
defaults:
run:
shell: bash -el {0}
@@ -160,7 +160,7 @@ jobs:
requirements-dev-text-installable:
name: Test install requirements-dev.txt
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
concurrency:
# https://github.community/t/concurrecy-not-work-for-push/183068/7
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 4d0066bc0b48d..44a9b4bfa20b8 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -13,7 +13,7 @@ permissions:
jobs:
analyze:
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
permissions:
actions: read
contents: read
diff --git a/.github/workflows/comment-commands.yml b/.github/workflows/comment-commands.yml
index 62956f5825782..b843363ae8c4d 100644
--- a/.github/workflows/comment-commands.yml
+++ b/.github/workflows/comment-commands.yml
@@ -10,7 +10,7 @@ permissions:
jobs:
issue_assign:
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
if: (!github.event.issue.pull_request) && github.event.comment.body == 'take'
concurrency:
group: ${{ github.actor }}-issue-assign
@@ -19,7 +19,7 @@ jobs:
echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}"
curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees
preview_docs:
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
if: github.event.issue.pull_request && github.event.comment.body == '/preview'
concurrency:
group: ${{ github.actor }}-preview-docs
@@ -29,7 +29,7 @@ jobs:
previewer-server: "https://pandas.pydata.org/preview"
artifact-job: "Doc Build and Upload"
asv_run:
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
# TODO: Support more benchmarking options later, against different branches, against self, etc
if: github.event.issue.pull_request && startsWith(github.event.comment.body, '@github-actions benchmark')
defaults:
diff --git a/.github/workflows/deprecation-tracking-bot.yml b/.github/workflows/deprecation-tracking-bot.yml
index 3d4cab7be09c5..334a5d77b407b 100644
--- a/.github/workflows/deprecation-tracking-bot.yml
+++ b/.github/workflows/deprecation-tracking-bot.yml
@@ -17,7 +17,7 @@ jobs:
deprecation_update:
permissions:
issues: write
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
env:
DEPRECATION_TRACKER_ISSUE: 56596
steps:
diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml
index 294334ca1d54b..ba9e30e088c66 100644
--- a/.github/workflows/docbuild-and-upload.yml
+++ b/.github/workflows/docbuild-and-upload.yml
@@ -23,7 +23,7 @@ permissions:
jobs:
web_and_docs:
name: Doc Build and Upload
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
concurrency:
# https://github.community/t/concurrecy-not-work-for-push/183068/7
diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml
index 331af6e05b650..9800cc1694313 100644
--- a/.github/workflows/package-checks.yml
+++ b/.github/workflows/package-checks.yml
@@ -21,7 +21,7 @@ defaults:
jobs:
pip:
if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}}
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
strategy:
matrix:
extra: ["test", "pyarrow", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "all"]
@@ -50,7 +50,7 @@ jobs:
shell: bash -el {0}
conda_forge_recipe:
if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}}
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
strategy:
matrix:
python-version: ['3.10', '3.11']
diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml
index 792afe8f4faf5..3a51dbefc6bb0 100644
--- a/.github/workflows/stale-pr.yml
+++ b/.github/workflows/stale-pr.yml
@@ -12,7 +12,7 @@ jobs:
permissions:
pull-requests: write
if: github.repository_owner == 'pandas-dev'
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
steps:
- uses: actions/stale@v9
with:
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 08c41a1eeb21f..59512ddc91a8a 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -26,8 +26,8 @@ jobs:
timeout-minutes: 90
strategy:
matrix:
- platform: [ubuntu-22.04, ubuntu-24.04-arm]
- env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml]
+ platform: [ubuntu-24.04, ubuntu-24.04-arm]
+ env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml, actions-313.yaml]
# Prevent the include jobs from overriding other jobs
pattern: [""]
pandas_future_infer_string: ["0"]
@@ -36,11 +36,15 @@ jobs:
env_file: actions-311-downstream_compat.yaml
pattern: "not slow and not network and not single_cpu"
pytest_target: "pandas/tests/test_downstream.py"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
- name: "Minimum Versions"
env_file: actions-310-minimum_versions.yaml
pattern: "not slow and not network and not single_cpu"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
+ - name: "Freethreading"
+ env_file: actions-313-freethreading.yaml
+ pattern: "not slow and not network and not single_cpu"
+ platform: ubuntu-24.04
- name: "Locale: it_IT"
env_file: actions-311.yaml
pattern: "not slow and not network and not single_cpu"
@@ -51,7 +55,7 @@ jobs:
# Also install it_IT (its encoding is ISO8859-1) but do not activate it.
# It will be temporarily activated during tests with locale.setlocale
extra_loc: "it_IT"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
- name: "Locale: zh_CN"
env_file: actions-311.yaml
pattern: "not slow and not network and not single_cpu"
@@ -62,30 +66,30 @@ jobs:
# Also install zh_CN (its encoding is gb2312) but do not activate it.
# It will be temporarily activated during tests with locale.setlocale
extra_loc: "zh_CN"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
- name: "Future infer strings"
env_file: actions-312.yaml
pandas_future_infer_string: "1"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
- name: "Future infer strings (without pyarrow)"
env_file: actions-311.yaml
pandas_future_infer_string: "1"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
- name: "Pypy"
env_file: actions-pypy-39.yaml
pattern: "not slow and not network and not single_cpu"
test_args: "--max-worker-restart 0"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
- name: "Numpy Dev"
env_file: actions-311-numpydev.yaml
pattern: "not slow and not network and not single_cpu"
test_args: "-W error::DeprecationWarning -W error::FutureWarning"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
- name: "Pyarrow Nightly"
env_file: actions-311-pyarrownightly.yaml
pattern: "not slow and not network and not single_cpu"
pandas_future_infer_string: "1"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
fail-fast: false
name: ${{ matrix.name || format('{0} {1}', matrix.platform, matrix.env_file) }}
env:
@@ -165,6 +169,9 @@ jobs:
- name: Build Pandas
id: build
uses: ./.github/actions/build_pandas
+ with:
+ # xref https://github.com/cython/cython/issues/6870
+ werror: ${{ matrix.name != 'Freethreading' }}
# TODO: Re-enable once Pypy has Pypy 3.10 on conda-forge
if: ${{ matrix.name != 'Pypy' }}
@@ -188,7 +195,7 @@ jobs:
matrix:
# Note: Don't use macOS latest since macos 14 appears to be arm64 only
os: [macos-13, macos-14, windows-latest]
- env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml]
+ env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml, actions-313.yaml]
fail-fast: false
runs-on: ${{ matrix.os }}
name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }}
@@ -220,7 +227,7 @@ jobs:
uses: ./.github/actions/run-tests
Linux-32-bit:
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
container:
image: quay.io/pypa/manylinux2014_i686
options: --platform linux/386
@@ -241,12 +248,14 @@ jobs:
fi
- name: Build environment and Run Tests
# https://github.com/numpy/numpy/issues/24703#issuecomment-1722379388
+ # Note: Pinned to Cython 3.0.10 to avoid numerical instability in 32-bit environments
+ # https://github.com/pandas-dev/pandas/pull/61423
run: |
/opt/python/cp313-cp313/bin/python -m venv ~/virtualenvs/pandas-dev
. ~/virtualenvs/pandas-dev/bin/activate
python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1
python -m pip install numpy -Csetup-args="-Dallow-noblas=true"
- python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
+ python -m pip install --no-cache-dir versioneer[toml] cython==3.0.10 python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror"
python -m pip list --no-cache-dir
PANDAS_CI=1 python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml
@@ -256,7 +265,7 @@ jobs:
cancel-in-progress: true
Linux-Musl:
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
container:
image: quay.io/pypa/musllinux_1_2_x86_64
steps:
@@ -316,7 +325,7 @@ jobs:
# To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs
# to the corresponding posix/windows-macos/sdist etc. workflows.
# Feel free to modify this comment as necessary.
- # if: false # Uncomment this to freeze the workflow, comment it to unfreeze
+ if: false
defaults:
run:
shell: bash -eou pipefail {0}
@@ -325,7 +334,7 @@ jobs:
fail-fast: false
matrix:
# Separate out macOS 13 and 14, since macOS 14 is arm64 only
- os: [ubuntu-22.04, macOS-13, macOS-14, windows-latest]
+ os: [ubuntu-24.04, macOS-13, macOS-14, windows-latest]
timeout-minutes: 90
@@ -362,48 +371,6 @@ jobs:
- name: Run Tests
uses: ./.github/actions/run-tests
- python-freethreading:
- defaults:
- run:
- shell: bash -eou pipefail {0}
- runs-on: ubuntu-22.04
-
- timeout-minutes: 90
-
- concurrency:
- # https://github.community/t/concurrecy-not-work-for-push/183068/7
- group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-python-freethreading-dev
- cancel-in-progress: true
-
- env:
- PYTEST_WORKERS: "auto"
- PANDAS_CI: 1
- PATTERN: "not slow and not network and not clipboard and not single_cpu"
- PYTEST_TARGET: pandas
-
- steps:
- - uses: actions/checkout@v4
- with:
- fetch-depth: 0
-
- - name: Set up Python Free-threading Version
- uses: deadsnakes/action@v3.2.0
- with:
- python-version: 3.13-dev
- nogil: true
-
- - name: Build Environment
- run: |
- python --version
- python -m pip install --upgrade pip setuptools wheel numpy meson[ninja]==1.2.1 meson-python==0.13.1
- python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython
- python -m pip install versioneer[toml] python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov
- python -m pip install -ve . --no-build-isolation --no-index --no-deps -Csetup-args="--werror"
- python -m pip list
-
- - name: Run Tests
- uses: ./.github/actions/run-tests
-
# NOTE: this job must be kept in sync with the Pyodide build job in wheels.yml
emscripten:
# Note: the Python version, Emscripten toolchain version are determined
@@ -413,7 +380,7 @@ jobs:
# The Node.js version can be determined via Pyodide:
# https://pyodide.org/en/stable/usage/index.html#node-js
name: Pyodide build
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
concurrency:
# https://github.community/t/concurrecy-not-work-for-push/183068/7
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-wasm
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 2dcc79085734b..4de7aec4f551a 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -40,7 +40,7 @@ jobs:
(github.event_name == 'pull_request' &&
contains(github.event.pull_request.labels.*.name, 'Build')) ||
(github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0')))
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
env:
IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
@@ -92,26 +92,30 @@ jobs:
# GitHub Actions doesn't support pairing matrix values together, let's improvise
# https://github.com/github/feedback/discussions/7835#discussioncomment-1769026
buildplat:
- - [ubuntu-22.04, manylinux_x86_64]
- - [ubuntu-22.04, musllinux_x86_64]
+ - [ubuntu-24.04, manylinux_x86_64]
+ - [ubuntu-24.04, musllinux_x86_64]
- [ubuntu-24.04-arm, manylinux_aarch64]
+ - [ubuntu-24.04-arm, musllinux_aarch64]
- [macos-13, macosx_x86_64]
# Note: M1 images on Github Actions start from macOS 14
- [macos-14, macosx_arm64]
- [windows-2022, win_amd64]
+ - [windows-11-arm, win_arm64]
# TODO: support PyPy?
python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"]]
include:
- # TODO: Remove this plus installing build deps in cibw_before_build.sh
- # after pandas can be built with a released NumPy/Cython
- - python: ["cp313t", "3.13"]
- cibw_build_frontend: 'pip; args: --no-build-isolation'
# Build Pyodide wheels and upload them to Anaconda.org
# NOTE: this job is similar to the one in unit-tests.yml except for the fact
# that it uses cibuildwheel instead of a standard Pyodide xbuildenv setup.
- - buildplat: [ubuntu-22.04, pyodide_wasm32]
+ - buildplat: [ubuntu-24.04, pyodide_wasm32]
python: ["cp312", "3.12"]
cibw_build_frontend: 'build'
+ exclude:
+ - buildplat: [windows-11-arm, win_arm64]
+ python: ["cp310", "3.10"]
+ # BackendUnavailable: Cannot import 'mesonpy'
+ - buildplat: [windows-11-arm, win_arm64]
+ python: ["cp313t", "3.13"]
env:
IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
@@ -122,6 +126,12 @@ jobs:
with:
fetch-depth: 0
+ - name: Set up MSVC environment for ARM64
+ if: matrix.buildplat[1] == 'win_arm64'
+ uses: ilammy/msvc-dev-cmd@v1
+ with:
+ arch: arm64
+
# TODO: Build wheels from sdist again
# There's some sort of weird race condition?
# within Github that makes the sdist be missing files
@@ -153,15 +163,19 @@ jobs:
run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"
- name: Build wheels
- uses: pypa/cibuildwheel@v2.23.1
+ uses: pypa/cibuildwheel@v2.23.3
with:
package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
env:
CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }}
CIBW_BUILD_FRONTEND: ${{ matrix.cibw_build_frontend || 'pip' }}
- CIBW_PLATFORM: ${{ matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide' || 'auto' }}
+ CIBW_PLATFORM: ${{ (matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide') || (matrix.buildplat[1] == 'win_arm64' && 'windows') || 'auto' }}
+ CIBW_ARCHS: ${{ matrix.buildplat[1] == 'win_arm64' && 'ARM64' || 'auto' }}
+ CIBW_BEFORE_BUILD_WINDOWS: 'python -m pip install delvewheel'
- - name: Set up Python
+ - name: Set up Python for validation/upload (non-ARM64 Windows & other OS)
+ # micromamba is not available for ARM64 Windows
+ if: matrix.buildplat[1] != 'win_arm64'
uses: mamba-org/setup-micromamba@v2
with:
environment-name: wheel-env
@@ -174,6 +188,12 @@ jobs:
cache-downloads: true
cache-environment: true
+ - name: Install wheel for win_arm64
+ # installing wheel here because micromamba step was skipped
+ if: matrix.buildplat[1] == 'win_arm64'
+ shell: bash -el {0}
+ run: python -m pip install wheel
+
- name: Validate wheel RECORD
shell: bash -el {0}
run: for whl in $(ls wheelhouse); do wheel unpack wheelhouse/$whl -d /tmp; done
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 09bfda1755e03..b5856810b749e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,4 @@
-minimum_pre_commit_version: 2.15.0
+minimum_pre_commit_version: 4.0.0
exclude: ^LICENSES/|\.(html|csv|svg)$
# reserve "manual" for relatively slow hooks which we still want to run in CI
default_stages: [
@@ -19,13 +19,13 @@ ci:
skip: [pyright, mypy]
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.9.9
+ rev: v0.11.12
hooks:
- id: ruff
args: [--exit-non-zero-on-fix]
exclude: ^pandas/tests/frame/test_query_eval.py
- id: ruff
- # TODO: remove autofixe-only rules when they are checked by ruff
+ # TODO: remove autofix-only rules when they are checked by ruff
name: ruff-selected-autofixes
alias: ruff-selected-autofixes
files: ^pandas
@@ -34,7 +34,7 @@ repos:
- id: ruff-format
exclude: ^scripts|^pandas/tests/frame/test_query_eval.py
- repo: https://github.com/jendrikseipp/vulture
- rev: 'v2.14'
+ rev: v2.14
hooks:
- id: vulture
entry: python scripts/run_vulture.py
@@ -74,7 +74,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/asottile/pyupgrade
- rev: v3.19.1
+ rev: v3.20.0
hooks:
- id: pyupgrade
args: [--py310-plus]
@@ -95,14 +95,14 @@ repos:
- id: sphinx-lint
args: ["--enable", "all", "--disable", "line-too-long"]
- repo: https://github.com/pre-commit/mirrors-clang-format
- rev: v19.1.7
+ rev: v20.1.5
hooks:
- id: clang-format
files: ^pandas/_libs/src|^pandas/_libs/include
args: [-i]
types_or: [c, c++]
- repo: https://github.com/trim21/pre-commit-mirror-meson
- rev: v1.7.0
+ rev: v1.8.1
hooks:
- id: meson-fmt
args: ['--inplace']
@@ -140,7 +140,7 @@ repos:
pass_filenames: false
types: [python]
stages: [manual]
- - id: mypy
+ - id: stubtest
# note: assumes python env is setup and activated
# note: requires pandas dev to be installed
name: mypy (stubtest)
diff --git a/Dockerfile b/Dockerfile
index 4090a4adb1af8..e778312fd3aa2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,16 +1,20 @@
FROM python:3.10.8
WORKDIR /home/pandas
-RUN apt-get update && apt-get -y upgrade
-RUN apt-get install -y build-essential bash-completion
+RUN apt-get update && \
+ apt-get --no-install-recommends -y upgrade && \
+ apt-get --no-install-recommends -y install \
+ build-essential \
+ bash-completion \
+ # hdf5 needed for pytables installation
+ libhdf5-dev \
+ # libgles2-mesa needed for pytest-qt
+ libgles2-mesa-dev && \
+ rm -rf /var/lib/apt/lists/*
-# hdf5 needed for pytables installation
-# libgles2-mesa needed for pytest-qt
-RUN apt-get install -y libhdf5-dev libgles2-mesa-dev
-
-RUN python -m pip install --upgrade pip
COPY requirements-dev.txt /tmp
-RUN python -m pip install -r /tmp/requirements-dev.txt
+RUN python -m pip install --no-cache-dir --upgrade pip && \
+ python -m pip install --no-cache-dir -r /tmp/requirements-dev.txt
RUN git config --global --add safe.directory /home/pandas
ENV SHELL="/bin/bash"
diff --git a/MANIFEST.in b/MANIFEST.in
index c59151f340545..a7d7d7eb4e062 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -65,5 +65,3 @@ graft pandas/_libs/include
# Include cibw script in sdist since it's needed for building wheels
include scripts/cibw_before_build.sh
-include scripts/cibw_before_build_windows.sh
-include scripts/cibw_before_test_windows.sh
diff --git a/README.md b/README.md
index 1a273fdb896c5..ebab2e6016850 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
-----------------
-# pandas: powerful Python data analysis toolkit
+# pandas: A Powerful Python Data Analysis Toolkit
| | |
| --- | --- |
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
index 30c692115eab1..d286e57ce6b51 100644
--- a/asv_bench/asv.conf.json
+++ b/asv_bench/asv.conf.json
@@ -42,7 +42,7 @@
// followed by the pip installed packages).
"matrix": {
"pip+build": [],
- "Cython": ["3.0"],
+ "Cython": [],
"matplotlib": [],
"sqlalchemy": [],
"scipy": [],
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index 6a2ab24df26fe..cd7851acae3f2 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -517,7 +517,7 @@ def setup(self):
self.df = DataFrame(np.random.randn(1000, 100))
self.s = Series(np.arange(1028.0))
- self.df2 = DataFrame({i: self.s for i in range(1028)})
+ self.df2 = DataFrame(dict.fromkeys(range(1028), self.s))
self.df3 = DataFrame(np.random.randn(1000, 3), columns=list("ABC"))
def time_apply_user_func(self):
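The ``dict.fromkeys`` change in the benchmark above is a readability cleanup rather than a behavior change: when every key maps to the same object, ``dict.fromkeys`` builds the same mapping as the comprehension. A minimal standalone sketch (not the benchmark itself, values are illustrative):

```python
import numpy as np
import pandas as pd

s = pd.Series(np.arange(8.0))

# Both spellings build a dict mapping each column label to the same Series object.
via_comprehension = {i: s for i in range(4)}
via_fromkeys = dict.fromkeys(range(4), s)

assert list(via_comprehension) == list(via_fromkeys)
assert all(via_comprehension[k] is via_fromkeys[k] for k in via_fromkeys)

# Constructing a DataFrame from either mapping gives the same result.
pd.testing.assert_frame_equal(pd.DataFrame(via_comprehension), pd.DataFrame(via_fromkeys))
```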
diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py
index 5e3c593e269cb..da0e7de585391 100644
--- a/asv_bench/benchmarks/indexing_engines.py
+++ b/asv_bench/benchmarks/indexing_engines.py
@@ -67,6 +67,14 @@ class NumericEngineIndexing:
def setup(self, engine_and_dtype, index_type, unique, N):
engine, dtype = engine_and_dtype
+ if (
+ index_type == "non_monotonic"
+ and dtype in [np.int16, np.int8, np.uint8]
+ and unique
+ ):
+ # Values overflow
+ raise NotImplementedError
+
if index_type == "monotonic_incr":
if unique:
arr = np.arange(N * 3, dtype=dtype)
@@ -115,6 +123,14 @@ def setup(self, engine_and_dtype, index_type, unique, N):
engine, dtype = engine_and_dtype
dtype = dtype.lower()
+ if (
+ index_type == "non_monotonic"
+ and dtype in ["int16", "int8", "uint8"]
+ and unique
+ ):
+ # Values overflow
+ raise NotImplementedError
+
if index_type == "monotonic_incr":
if unique:
arr = np.arange(N * 3, dtype=dtype)
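For context on the guard added above: the setup builds ``np.arange(N * 3, dtype=dtype)`` (and shuffled variants), and for the smallest integer dtypes those values cannot be represented, so the unique non-monotonic case is skipped. A minimal sketch of the range mismatch, assuming an illustrative N (the real value is a benchmark parameter):

```python
import numpy as np

N = 2_000_000  # illustrative only; the benchmark's N is a parameter

for dtype in (np.int8, np.uint8, np.int16):
    info = np.iinfo(dtype)
    largest_needed = N * 3 - 1  # top of the np.arange(N * 3) range
    print(f"{dtype.__name__}: representable up to {info.max}, needs {largest_needed}")
```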
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 2c32eb4f0c584..a0d23aa0478d2 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -72,9 +72,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
-i "pandas.Period.freq GL08" \
-i "pandas.Period.ordinal GL08" \
- -i "pandas.Timestamp.max PR02" \
- -i "pandas.Timestamp.min PR02" \
- -i "pandas.Timestamp.resolution PR02" \
-i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
-i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
-i "pandas.core.resample.Resampler.quantile PR01,PR07" \
diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml
index c7c72828db481..9f12fe941d488 100644
--- a/ci/deps/actions-310-minimum_versions.yaml
+++ b/ci/deps/actions-310-minimum_versions.yaml
@@ -8,7 +8,7 @@ dependencies:
# build dependencies
- versioneer
- - cython>=0.29.33
+ - cython<4.0.0a0
- meson=1.2.1
- meson-python=0.13.1
@@ -18,46 +18,46 @@ dependencies:
- pytest-xdist>=3.4.0
- pytest-localserver>=0.8.1
- pytest-qt>=4.4.0
- - boto3
+ - boto3=1.37.3
# required dependencies
- python-dateutil=2.8.2
- numpy=1.23.5
# optional dependencies
- - beautifulsoup4=4.11.2
- - blosc=1.21.3
+ - beautifulsoup4=4.12.3
- bottleneck=1.3.6
- - fastparquet=2023.10.0
- - fsspec=2022.11.0
+ - fastparquet=2024.2.0
+ - fsspec=2023.12.2
- html5lib=1.1
- hypothesis=6.84.0
- - gcsfs=2022.11.0
- - jinja2=3.1.2
+ - gcsfs=2023.12.2
+ - jinja2=3.1.3
- lxml=4.9.2
- - matplotlib=3.6.3
- - numba=0.56.4
- - numexpr=2.8.4
+ - matplotlib=3.8.3
+ - numba=0.59.0
+ - numexpr=2.9.0
- odfpy=1.4.1
- qtpy=2.3.0
- - openpyxl=3.1.0
+ - openpyxl=3.1.2
- psycopg2=2.9.6
- pyarrow=10.0.1
- - pymysql=1.0.2
+ - pyiceberg=0.7.1
+ - pymysql=1.1.0
- pyqt=5.15.9
- - pyreadstat=1.2.0
+ - pyreadstat=1.2.6
- pytables=3.8.0
- python-calamine=0.1.7
- pytz=2023.4
- pyxlsb=1.0.10
- - s3fs=2022.11.0
- - scipy=1.10.0
+ - s3fs=2023.12.2
+ - scipy=1.12.0
- sqlalchemy=2.0.0
- tabulate=0.9.0
- - xarray=2022.12.0
+ - xarray=2024.1.1
- xlrd=2.0.1
- - xlsxwriter=3.0.5
- - zstandard=0.19.0
+ - xlsxwriter=3.2.0
+ - zstandard=0.22.0
- pip:
- adbc-driver-postgresql==0.10.0
diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
index 74cab4e0970dc..66d49475bf34b 100644
--- a/ci/deps/actions-310.yaml
+++ b/ci/deps/actions-310.yaml
@@ -6,7 +6,7 @@ dependencies:
# build dependencies
- versioneer
- - cython>=0.29.33
+ - cython<4.0.0a0
- meson=1.2.1
- meson-python=0.13.1
@@ -16,46 +16,46 @@ dependencies:
- pytest-xdist>=3.4.0
- pytest-localserver>=0.8.1
- pytest-qt>=4.4.0
- - boto3
+ - boto3=1.37.3
# required dependencies
- python-dateutil
- numpy
# optional dependencies
- - beautifulsoup4>=4.11.2
- - blosc>=1.21.3
+ - beautifulsoup4>=4.12.3
- bottleneck>=1.3.6
- - fastparquet>=2023.10.0
- - fsspec>=2022.11.0
+ - fastparquet>=2024.2.0
+ - fsspec>=2023.12.2
- html5lib>=1.1
- hypothesis>=6.84.0
- - gcsfs>=2022.11.0
- - jinja2>=3.1.2
+ - gcsfs>=2023.12.2
+ - jinja2>=3.1.3
- lxml>=4.9.2
- - matplotlib>=3.6.3
- - numba>=0.56.4
- - numexpr>=2.8.4
+ - matplotlib>=3.8.3
+ - numba>=0.59.0
+ - numexpr>=2.9.0
- odfpy>=1.4.1
- qtpy>=2.3.0
- - openpyxl>=3.1.0
+ - openpyxl>=3.1.2
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- - pymysql>=1.0.2
+ - pyiceberg>=0.7.1
+ - pymysql>=1.1.0
- pyqt>=5.15.9
- - pyreadstat>=1.2.0
+ - pyreadstat>=1.2.6
- pytables>=3.8.0
- python-calamine>=0.1.7
- pytz>=2023.4
- pyxlsb>=1.0.10
- - s3fs>=2022.11.0
- - scipy>=1.10.0
+ - s3fs>=2023.12.2
+ - scipy>=1.12.0
- sqlalchemy>=2.0.0
- tabulate>=0.9.0
- - xarray>=2022.12.0, <=2024.9.0
+ - xarray>=2024.1.1
- xlrd>=2.0.1
- - xlsxwriter>=3.0.5
- - zstandard>=0.19.0
+ - xlsxwriter>=3.2.0
+ - zstandard>=0.22.0
- pip:
- adbc-driver-postgresql>=0.10.0
diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml
index 092ca18d61259..100a250f0bf01 100644
--- a/ci/deps/actions-311-downstream_compat.yaml
+++ b/ci/deps/actions-311-downstream_compat.yaml
@@ -7,7 +7,7 @@ dependencies:
# build dependencies
- versioneer
- - cython>=0.29.33
+ - cython<4.0.0a0
- meson=1.2.1
- meson-python=0.13.1
@@ -17,60 +17,58 @@ dependencies:
- pytest-xdist>=3.4.0
- pytest-localserver>=0.8.1
- pytest-qt>=4.4.0
- - boto3
+ - boto3=1.37.3
# required dependencies
- python-dateutil
- numpy
# optional dependencies
- - beautifulsoup4>=4.11.2
- - blosc>=1.21.3
+ - beautifulsoup4>=4.12.3
- bottleneck>=1.3.6
- - fastparquet>=2023.10.0
- - fsspec>=2022.11.0
+ - fastparquet>=2024.2.0
+ - fsspec>=2023.12.2
- html5lib>=1.1
- hypothesis>=6.84.0
- - gcsfs>=2022.11.0
- - jinja2>=3.1.2
+ - gcsfs>=2023.12.2
+ - jinja2>=3.1.3
- lxml>=4.9.2
- - matplotlib>=3.6.3
- - numba>=0.56.4
- - numexpr>=2.8.4
+ - matplotlib>=3.8.3
+ - numba>=0.59.0
+ - numexpr>=2.9.0
- odfpy>=1.4.1
- qtpy>=2.3.0
- - openpyxl>=3.1.0
+ - openpyxl>=3.1.2
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- - pymysql>=1.0.2
+ - pyiceberg>=0.7.1
+ - pymysql>=1.1.0
- pyqt>=5.15.9
- - pyreadstat>=1.2.0
+ - pyreadstat>=1.2.6
- pytables>=3.8.0
- python-calamine>=0.1.7
- pytz>=2023.4
- pyxlsb>=1.0.10
- - s3fs>=2022.11.0
- - scipy>=1.10.0
+ - s3fs>=2023.12.2
+ - scipy>=1.12.0
- sqlalchemy>=2.0.0
- tabulate>=0.9.0
- - xarray>=2022.12.0, <=2024.9.0
+ - xarray>=2024.1.1
- xlrd>=2.0.1
- - xlsxwriter>=3.0.5
- - zstandard>=0.19.0
+ - xlsxwriter>=3.2.0
+ - zstandard>=0.22.0
# downstream packages
- botocore
- cftime
- dask
- ipython
- - geopandas-base
- seaborn
- scikit-learn
- statsmodels
- coverage
- pandas-datareader
- pyyaml
- - py
- pip:
- adbc-driver-postgresql>=0.10.0
- adbc-driver-sqlite>=0.8.0
diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml
index 325a6d45d74fd..99cbe0415b4f9 100644
--- a/ci/deps/actions-311-numpydev.yaml
+++ b/ci/deps/actions-311-numpydev.yaml
@@ -8,7 +8,7 @@ dependencies:
- versioneer
- meson=1.2.1
- meson-python=0.13.1
- - cython>=0.29.33
+ - cython<4.0.0a0
# test dependencies
- pytest>=7.3.2
diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml
index 2d3d11c294e12..da0cecda0fb46 100644
--- a/ci/deps/actions-311-pyarrownightly.yaml
+++ b/ci/deps/actions-311-pyarrownightly.yaml
@@ -7,7 +7,7 @@ dependencies:
# build dependencies
- versioneer
- meson=1.2.1
- - cython>=0.29.33
+ - cython<4.0.0a0
- meson-python=0.13.1
# test dependencies
diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
index b6f515dceaea9..9669c1e29a435 100644
--- a/ci/deps/actions-311.yaml
+++ b/ci/deps/actions-311.yaml
@@ -6,7 +6,7 @@ dependencies:
# build dependencies
- versioneer
- - cython>=0.29.33
+ - cython<4.0.0a0
- meson=1.2.1
- meson-python=0.13.1
@@ -16,46 +16,46 @@ dependencies:
- pytest-xdist>=3.4.0
- pytest-localserver>=0.8.1
- pytest-qt>=4.4.0
- - boto3
+ - boto3=1.37.3
# required dependencies
- python-dateutil
- numpy
# optional dependencies
- - beautifulsoup4>=4.11.2
- - blosc>=1.21.3
+ - beautifulsoup4>=4.12.3
- bottleneck>=1.3.6
- - fastparquet>=2023.10.0
- - fsspec>=2022.11.0
+ - fastparquet>=2024.2.0
+ - fsspec>=2023.12.2
- html5lib>=1.1
- hypothesis>=6.84.0
- - gcsfs>=2022.11.0
- - jinja2>=3.1.2
+ - gcsfs>=2023.12.2
+ - jinja2>=3.1.3
- lxml>=4.9.2
- - matplotlib>=3.6.3
- - numba>=0.56.4
- - numexpr>=2.8.4
+ - matplotlib>=3.8.3
+ - numba>=0.59.0
+ - numexpr>=2.9.0
- odfpy>=1.4.1
- qtpy>=2.3.0
- pyqt>=5.15.9
- - openpyxl>=3.1.0
+ - openpyxl>=3.1.2
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- - pymysql>=1.0.2
- - pyreadstat>=1.2.0
+ - pyiceberg>=0.7.1
+ - pymysql>=1.1.0
+ - pyreadstat>=1.2.6
- pytables>=3.8.0
- python-calamine>=0.1.7
- pytz>=2023.4
- pyxlsb>=1.0.10
- - s3fs>=2022.11.0
- - scipy>=1.10.0
+ - s3fs>=2023.12.2
+ - scipy>=1.12.0
- sqlalchemy>=2.0.0
- tabulate>=0.9.0
- - xarray>=2022.12.0, <=2024.9.0
+ - xarray>=2024.1.1
- xlrd>=2.0.1
- - xlsxwriter>=3.0.5
- - zstandard>=0.19.0
+ - xlsxwriter>=3.2.0
+ - zstandard>=0.22.0
- pip:
- adbc-driver-postgresql>=0.10.0
diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml
index bc66f8a5382c9..61f1d602bb241 100644
--- a/ci/deps/actions-312.yaml
+++ b/ci/deps/actions-312.yaml
@@ -6,7 +6,7 @@ dependencies:
# build dependencies
- versioneer
- - cython>=0.29.33
+ - cython<4.0.0a0
- meson=1.2.1
- meson-python=0.13.1
@@ -16,46 +16,46 @@ dependencies:
- pytest-xdist>=3.4.0
- pytest-localserver>=0.8.1
- pytest-qt>=4.4.0
- - boto3
+ - boto3=1.37.3
# required dependencies
- python-dateutil
- numpy
# optional dependencies
- - beautifulsoup4>=4.11.2
- - blosc>=1.21.3
+ - beautifulsoup4>=4.12.3
- bottleneck>=1.3.6
- - fastparquet>=2023.10.0
- - fsspec>=2022.11.0
+ - fastparquet>=2024.2.0
+ - fsspec>=2023.12.2
- html5lib>=1.1
- hypothesis>=6.84.0
- - gcsfs>=2022.11.0
- - jinja2>=3.1.2
+ - gcsfs>=2023.12.2
+ - jinja2>=3.1.3
- lxml>=4.9.2
- - matplotlib>=3.6.3
- - numba>=0.56.4
- - numexpr>=2.8.4
+ - matplotlib>=3.8.3
+ - numba>=0.59.0
+ - numexpr>=2.9.0
- odfpy>=1.4.1
- qtpy>=2.3.0
- pyqt>=5.15.9
- - openpyxl>=3.1.0
+ - openpyxl>=3.1.2
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- - pymysql>=1.0.2
- - pyreadstat>=1.2.0
+ - pyiceberg>=0.7.1
+ - pymysql>=1.1.0
+ - pyreadstat>=1.2.6
- pytables>=3.8.0
- python-calamine>=0.1.7
- pytz>=2023.4
- pyxlsb>=1.0.10
- - s3fs>=2022.11.0
- - scipy>=1.10.0
+ - s3fs>=2023.12.2
+ - scipy>=1.12.0
- sqlalchemy>=2.0.0
- tabulate>=0.9.0
- - xarray>=2022.12.0, <=2024.9.0
+ - xarray>=2024.1.1
- xlrd>=2.0.1
- - xlsxwriter>=3.0.5
- - zstandard>=0.19.0
+ - xlsxwriter>=3.2.0
+ - zstandard>=0.22.0
- pip:
- adbc-driver-postgresql>=0.10.0
diff --git a/ci/deps/actions-313-freethreading.yaml b/ci/deps/actions-313-freethreading.yaml
new file mode 100644
index 0000000000000..14e3ade976b01
--- /dev/null
+++ b/ci/deps/actions-313-freethreading.yaml
@@ -0,0 +1,29 @@
+name: pandas-dev-313-freethreading
+channels:
+ - conda-forge
+dependencies:
+ - python-freethreading
+
+ # build dependencies
+ - setuptools
+ - versioneer
+ - cython<4.0.0a0
+ - meson=1.8.0
+ - meson-python=0.18.0
+
+ # test dependencies
+ - pytest>=7.3.2
+ - pytest-xdist>=3.4.0
+
+ # required dependencies
+ - python-dateutil
+ - numpy
+
+ # optional dependencies
+ - hypothesis>=6.84.0
+
+ - pip:
+ # No free-threaded coveragepy (with the C-extension) on conda-forge yet
+ - pytest-cov
+ - "tzdata>=2022.7"
+ - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
diff --git a/ci/deps/actions-313.yaml b/ci/deps/actions-313.yaml
new file mode 100644
index 0000000000000..11f4428be27e5
--- /dev/null
+++ b/ci/deps/actions-313.yaml
@@ -0,0 +1,63 @@
+name: pandas-dev-313
+channels:
+ - conda-forge
+dependencies:
+ - python=3.13
+
+ # build dependencies
+ - versioneer
+ - cython<4.0.0a0
+ - meson=1.2.1
+ - meson-python=0.13.1
+
+ # test dependencies
+ - pytest>=7.3.2
+ - pytest-cov
+ - pytest-xdist>=3.4.0
+ - pytest-localserver>=0.8.1
+ - pytest-qt>=4.4.0
+ - boto3=1.37.3
+
+ # required dependencies
+ - python-dateutil
+ - numpy
+
+ # optional dependencies
+ - beautifulsoup4>=4.12.3
+ - blosc>=1.21.3
+ - bottleneck>=1.3.6
+ - fastparquet>=2024.2.0
+ - fsspec>=2023.12.2
+ - html5lib>=1.1
+ - hypothesis>=6.84.0
+ - gcsfs>=2023.12.2
+ - jinja2>=3.1.3
+ - lxml>=4.9.2
+ - matplotlib>=3.8.3
+ - numba>=0.59.0
+ - numexpr>=2.9.0
+ - odfpy>=1.4.1
+ - qtpy>=2.3.0
+ - pyqt>=5.15.9
+ - openpyxl>=3.1.2
+ - psycopg2>=2.9.6
+ - pyarrow>=10.0.1
+ - pymysql>=1.1.0
+ - pyreadstat>=1.2.6
+ - pytables>=3.8.0
+ - python-calamine>=0.1.7
+ - pytz>=2023.4
+ - pyxlsb>=1.0.10
+ - s3fs>=2023.12.2
+ - scipy>=1.12.0
+ - sqlalchemy>=2.0.0
+ - tabulate>=0.9.0
+ - xarray>=2024.1.1
+ - xlrd>=2.0.1
+ - xlsxwriter>=3.2.0
+ - zstandard>=0.22.0
+
+ - pip:
+ - adbc-driver-postgresql>=0.10.0
+ - adbc-driver-sqlite>=0.8.0
+ - tzdata>=2022.7
diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml
index 90933b24b88db..e0ddc6954e4a4 100644
--- a/ci/deps/actions-pypy-39.yaml
+++ b/ci/deps/actions-pypy-39.yaml
@@ -9,7 +9,7 @@ dependencies:
# build dependencies
- versioneer
- - cython>=0.29.33
+ - cython<4.0.0a0
- meson=1.2.1
- meson-python=0.13.1
diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf
index 3582e0c0dabf9..33fbf2507ed62 100644
Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf and b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf differ
diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx
index 746f508516964..5ce2e3be48d55 100644
Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx and b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx differ
diff --git a/doc/cheatsheet/README.md b/doc/cheatsheet/README.md
index b8599acff2f6e..b72c093b4ba2f 100644
--- a/doc/cheatsheet/README.md
+++ b/doc/cheatsheet/README.md
@@ -12,7 +12,7 @@ This cheat sheet, originally written by Irv Lustig, [Princeton Consultants](http
| Pandas_Cheat_Sheet_JA | Japanese | | |
| Pandas_Cheat_Sheet_FA | Persian | | |
-
+The English version includes additional material that is not present in the other language versions.
**Alternative**
diff --git a/doc/source/_static/css/getting_started.css b/doc/source/_static/css/getting_started.css
index b02311eb66080..55141f8955066 100644
--- a/doc/source/_static/css/getting_started.css
+++ b/doc/source/_static/css/getting_started.css
@@ -249,6 +249,7 @@ ul.task-bullet > li > p:first-child {
.tutorial-card .card-header {
--bs-card-cap-color: var(--pst-color-text-base);
+ color: var(--pst-color-text-base);
cursor: pointer;
background-color: var(--pst-color-surface);
border: 1px solid var(--pst-color-border)
@@ -256,6 +257,7 @@ ul.task-bullet > li > p:first-child {
.tutorial-card .card-body {
background-color: var(--pst-color-on-background);
+ color: var(--pst-color-text-base);
}
.tutorial-card .badge {
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 677ee6274b093..f222a228531ff 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -114,6 +114,8 @@
):
exclude_patterns.append(rel_fname)
elif single_doc and rel_fname != pattern:
+ if "\\" in rel_fname:
+ rel_fname = rel_fname.replace("\\", "/")
exclude_patterns.append(rel_fname)
with open(os.path.join(source_path, "index.rst.template"), encoding="utf-8") as f:
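The conf.py tweak above normalizes Windows-style separators so single-doc exclude patterns match Sphinx's forward-slash convention. A small illustration of the same normalization (the path literal is only an example):

```python
from pathlib import PureWindowsPath

rel_fname = r"reference\api\pandas.DataFrame.rst"  # example Windows-style relative path

# Equivalent ways to normalize the separator before adding to exclude_patterns.
assert rel_fname.replace("\\", "/") == PureWindowsPath(rel_fname).as_posix()
print(PureWindowsPath(rel_fname).as_posix())  # reference/api/pandas.DataFrame.rst
```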
diff --git a/doc/source/development/community.rst b/doc/source/development/community.rst
index ab8294b8f135a..1c698d130ea6c 100644
--- a/doc/source/development/community.rst
+++ b/doc/source/development/community.rst
@@ -77,7 +77,7 @@ Any community member can open issues to:
- Ask questions, e.g. "I noticed the behavior of a certain function
changed between versions. Is this expected?".
- Ideally, your questions should be related to how pandas works rather
+ - Ideally, your questions should be related to how pandas works rather
than how you use pandas. `StackOverflow `_ is
better suited for answering usage questions, and we ask that all usage
questions are first asked on StackOverflow. Thank you for respecting our
diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
index 4d99f282aa695..66178a88e3e31 100644
--- a/doc/source/development/contributing.rst
+++ b/doc/source/development/contributing.rst
@@ -234,14 +234,14 @@ and merged into project to appear the in the next release. To submit a pull requ
#. Write a descriptive title that includes prefixes. pandas uses a convention for title
prefixes. Here are some common ones along with general guidelines for when to use them:
- * ENH: Enhancement, new functionality
- * BUG: Bug fix
- * DOC: Additions/updates to documentation
- * TST: Additions/updates to tests
- * BLD: Updates to the build process/scripts
- * PERF: Performance improvement
- * TYP: Type annotations
- * CLN: Code cleanup
+ * ENH: Enhancement, new functionality
+ * BUG: Bug fix
+ * DOC: Additions/updates to documentation
+ * TST: Additions/updates to tests
+ * BLD: Updates to the build process/scripts
+ * PERF: Performance improvement
+ * TYP: Type annotations
+ * CLN: Code cleanup
#. Write a description of your changes in the ``Preview Discussion`` tab
#. Click ``Send Pull Request``.
diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst
index b8d568428c156..73bc756de9302 100644
--- a/doc/source/development/contributing_codebase.rst
+++ b/doc/source/development/contributing_codebase.rst
@@ -444,11 +444,11 @@ be located.
result = ser.loc[[3, 4]]
tm.assert_series_equal(result, expected)
- In cases like this, the test location should be based on the *underlying*
- method being tested. Or in the case of a test for a bugfix, the location
- of the actual bug. So in this example, we know that ``Series.__getitem__``
- calls ``Series.loc.__getitem__``, so this is *really* a test for
- ``loc.__getitem__``. So this test belongs in ``tests.indexing.test_loc``.
+ In cases like this, the test location should be based on the *underlying*
+ method being tested. Or in the case of a test for a bugfix, the location
+ of the actual bug. So in this example, we know that ``Series.__getitem__``
+ calls ``Series.loc.__getitem__``, so this is *really* a test for
+ ``loc.__getitem__``. So this test belongs in ``tests.indexing.test_loc``.
6. Is your test for a DataFrame or Series method?
diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst
index 98bd4b00d016b..d7b779debcd5e 100644
--- a/doc/source/development/contributing_environment.rst
+++ b/doc/source/development/contributing_environment.rst
@@ -251,7 +251,7 @@ This option allows you to configure where meson stores your built C extensions,
Sometimes, it might be useful to compile pandas with debugging symbols, when debugging C extensions.
Appending ``-Csetup-args="-Ddebug=true"`` will do the trick.
-With pip, it is possible to chain together multiple config settings (for example specifying both a build directory
+With pip, it is possible to chain together multiple config settings. For example, specifying both a build directory
and building with debug symbols would look like
``-Cbuilddir="your builddir here" -Csetup-args="-Dbuildtype=debug"``.
diff --git a/doc/source/development/contributing_gitpod.rst b/doc/source/development/contributing_gitpod.rst
index b70981b4d307d..447b7b20a8ae5 100644
--- a/doc/source/development/contributing_gitpod.rst
+++ b/doc/source/development/contributing_gitpod.rst
@@ -158,8 +158,8 @@ Option 1: using Liveserve
file and click on **Open with Live Serve**. Alternatively, you can open the
file in the editor and click on the **Go live** button on the status bar.
- .. image:: ./gitpod-imgs/vscode-statusbar.png
- :alt: Gitpod workspace VSCode start live serve screenshot
+ .. image:: ./gitpod-imgs/vscode-statusbar.png
+ :alt: Gitpod workspace VSCode start live serve screenshot
#. A simple browser will open to the right-hand side of the editor. We recommend
closing it and click on the **Open in browser** button in the pop-up.
@@ -182,13 +182,13 @@ uses the rst extension with docutils.
:kbd:`Ctrl-Shift-P` in Linux and Windows. Start typing "restructured"
and choose either "Open preview" or "Open preview to the Side".
- .. image:: ./gitpod-imgs/vscode-rst.png
- :alt: Gitpod workspace VSCode open rst screenshot
+ .. image:: ./gitpod-imgs/vscode-rst.png
+ :alt: Gitpod workspace VSCode open rst screenshot
#. As you work on the document, you will see a live rendering of it on the editor.
- .. image:: ./gitpod-imgs/rst-rendering.png
- :alt: Gitpod workspace VSCode rst rendering screenshot
+ .. image:: ./gitpod-imgs/rst-rendering.png
+ :alt: Gitpod workspace VSCode rst rendering screenshot
If you want to see the final output with the ``html`` theme you will need to
rebuild the docs with ``make html`` and use Live Serve as described in option 1.
diff --git a/doc/source/development/debugging_extensions.rst b/doc/source/development/debugging_extensions.rst
index 0ea1c112cb55b..c8127e0cc2996 100644
--- a/doc/source/development/debugging_extensions.rst
+++ b/doc/source/development/debugging_extensions.rst
@@ -10,9 +10,9 @@ pandas uses Cython and C/C++ `extension modules `_
- 2. `Fundamental Python Debugging Part 2 - Python Extensions `_
- 3. `Fundamental Python Debugging Part 3 - Cython Extensions `_
+1. `Fundamental Python Debugging Part 1 - Python `_
+2. `Fundamental Python Debugging Part 2 - Python Extensions `_
+3. `Fundamental Python Debugging Part 3 - Cython Extensions `_
Debugging locally
-----------------
@@ -23,7 +23,7 @@ By default building pandas from source will generate a release build. To generat
.. note::
- conda environments update CFLAGS/CPPFLAGS with flags that are geared towards generating releases. If using conda, you may need to set ``CFLAGS="$CFLAGS -O0"`` and ``CPPFLAGS="$CPPFLAGS -O0"`` to ensure optimizations are turned off for debugging
+ conda environments update CFLAGS/CPPFLAGS with flags that are geared towards generating releases, which can work against building in a development environment. If using conda, you should unset these environment variables via ``export CFLAGS=`` and ``export CPPFLAGS=``
By specifying ``builddir="debug"`` all of the targets will be built and placed in the debug directory relative to the project root. This helps to keep your debug and release artifacts separate; you are of course able to choose a different directory name or omit altogether if you do not care to separate build types.
diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst
index c5c4b7c449ce7..21a840fbe9a5f 100644
--- a/doc/source/development/developer.rst
+++ b/doc/source/development/developer.rst
@@ -116,19 +116,19 @@ The ``metadata`` field is ``None`` except for:
omitted it is assumed to be nanoseconds.
* ``categorical``: ``{'num_categories': K, 'ordered': is_ordered, 'type': $TYPE}``
- * Here ``'type'`` is optional, and can be a nested pandas type specification
- here (but not categorical)
+ * Here ``'type'`` is optional, and can be a nested pandas type specification
+ here (but not categorical)
* ``unicode``: ``{'encoding': encoding}``
- * The encoding is optional, and if not present is UTF-8
+ * The encoding is optional, and if not present is UTF-8
* ``object``: ``{'encoding': encoding}``. Objects can be serialized and stored
in ``BYTE_ARRAY`` Parquet columns. The encoding can be one of:
- * ``'pickle'``
- * ``'bson'``
- * ``'json'``
+ * ``'pickle'``
+ * ``'bson'``
+ * ``'json'``
* ``timedelta``: ``{'unit': 'ns'}``. The ``'unit'`` is optional, and if omitted
it is assumed to be nanoseconds. This metadata is optional altogether
diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst
index c572559dcc3e0..c37925f7e271a 100644
--- a/doc/source/development/maintaining.rst
+++ b/doc/source/development/maintaining.rst
@@ -218,11 +218,11 @@ pandas supports point releases (e.g. ``1.4.3``) that aim to:
1. Fix bugs in new features introduced in the first minor version release.
- * e.g. If a new feature was added in ``1.4`` and contains a bug, a fix can be applied in ``1.4.3``
+ * e.g. If a new feature was added in ``1.4`` and contains a bug, a fix can be applied in ``1.4.3``
2. Fix bugs that used to work in a few minor releases prior. There should be agreement between core team members that a backport is appropriate.
- * e.g. If a feature worked in ``1.2`` and stopped working since ``1.3``, a fix can be applied in ``1.4.3``.
+ * e.g. If a feature worked in ``1.2`` and stopped working since ``1.3``, a fix can be applied in ``1.4.3``.
Since pandas minor releases are based on GitHub branches (e.g. point release of ``1.4`` are based off the ``1.4.x`` branch),
"backporting" means merging a pull request fix to the ``main`` branch and correct minor branch associated with the next point release.
@@ -289,8 +289,8 @@ The required steps for adding a maintainer are:
1. Contact the contributor and ask their interest to join.
2. Add the contributor to the appropriate `GitHub Team `_ if accepted the invitation.
- * ``pandas-core`` is for core team members
- * ``pandas-triage`` is for pandas triage members
+ * ``pandas-core`` is for core team members
+ * ``pandas-triage`` is for pandas triage members
If adding to ``pandas-core``, there are two additional steps:
@@ -467,10 +467,10 @@ Post-Release
patch releases. The exact instructions are (replace the example version numbers by
the appropriate ones for the version you are releasing):
- - Log in to the server and use the correct user.
- - ``cd /var/www/html/pandas-docs/``
- - ``ln -sfn version/2.1 stable`` (for a major or minor release)
- - ``ln -sfn version/2.0.3 version/2.0`` (for a patch release)
+ - Log in to the server and use the correct user.
+ - ``cd /var/www/html/pandas-docs/``
+ - ``ln -sfn version/2.1 stable`` (for a major or minor release)
+ - ``ln -sfn version/2.0.3 version/2.0`` (for a patch release)
2. If releasing a major or minor release, open a PR in our source code to update
``web/pandas/versions.json``, to have the desired versions in the documentation
@@ -487,8 +487,8 @@ Post-Release
6. Announce the new release in the official channels (use previous announcements
for reference):
- - The pandas-dev and pydata mailing lists
- - X, Mastodon, Telegram and LinkedIn
+ - The pandas-dev and pydata mailing lists
+ - X, Mastodon, Telegram and LinkedIn
7. Update this release instructions to fix anything incorrect and to update about any
change since the last release.
diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst
index d9d7d916b0238..cc7add87b5935 100644
--- a/doc/source/getting_started/comparison/comparison_with_r.rst
+++ b/doc/source/getting_started/comparison/comparison_with_r.rst
@@ -383,7 +383,7 @@ In Python, since ``a`` is a list, you can simply use list comprehension.
.. ipython:: python
- a = np.array(list(range(1, 24)) + [np.NAN]).reshape(2, 3, 4)
+ a = np.array(list(range(1, 24)) + [np.nan]).reshape(2, 3, 4)
pd.DataFrame([tuple(list(x) + [val]) for x, val in np.ndenumerate(a)])
meltlist
@@ -402,7 +402,7 @@ In Python, this list would be a list of tuples, so
.. ipython:: python
- a = list(enumerate(list(range(1, 5)) + [np.NAN]))
+ a = list(enumerate(list(range(1, 5)) + [np.nan]))
pd.DataFrame(a)
For more details and examples see :ref:`the Intro to Data Structures
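The doc snippets above switch from ``np.NAN`` to ``np.nan``; the upper-case aliases were removed in NumPy 2.0, while ``np.nan`` remains the canonical spelling. A quick standalone check of the same construction:

```python
import numpy as np

a = np.array(list(range(1, 24)) + [np.nan]).reshape(2, 3, 4)
print(np.isnan(a).sum())  # 1 missing value, as in the doc example
```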
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index bda959f380e8a..1589fea5f8953 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -183,9 +183,9 @@ Installable with ``pip install "pandas[performance]"``
===================================================== ================== ================== ===================================================================================================================================================================================
Dependency Minimum Version pip extra Notes
===================================================== ================== ================== ===================================================================================================================================================================================
-`numexpr `__ 2.8.4 performance Accelerates certain numerical operations by using multiple cores as well as smart chunking and caching to achieve large speedups
+`numexpr `__ 2.9.0 performance Accelerates certain numerical operations by using multiple cores as well as smart chunking and caching to achieve large speedups
`bottleneck `__ 1.3.6 performance Accelerates certain types of ``nan`` by using specialized cython routines to achieve large speedup.
-`numba `__ 0.56.4 performance Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler.
+`numba `__ 0.59.0 performance Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler.
===================================================== ================== ================== ===================================================================================================================================================================================
Visualization
@@ -196,8 +196,8 @@ Installable with ``pip install "pandas[plot, output-formatting]"``.
========================================================== ================== ================== =======================================================
Dependency Minimum Version pip extra Notes
========================================================== ================== ================== =======================================================
-`matplotlib `__ 3.6.3 plot Plotting library
-`Jinja2 `__ 3.1.2 output-formatting Conditional formatting with DataFrame.style
+`matplotlib `__ 3.8.3 plot Plotting library
+`Jinja2 `__ 3.1.3 output-formatting Conditional formatting with DataFrame.style
`tabulate `__ 0.9.0 output-formatting Printing in Markdown-friendly format (see `tabulate`_)
========================================================== ================== ================== =======================================================
@@ -209,8 +209,8 @@ Installable with ``pip install "pandas[computation]"``.
============================================== ================== =============== =======================================
Dependency Minimum Version pip extra Notes
============================================== ================== =============== =======================================
-`SciPy `__ 1.10.0 computation Miscellaneous statistical functions
-`xarray `__ 2022.12.0 computation pandas-like API for N-dimensional data
+`SciPy `__ 1.12.0 computation Miscellaneous statistical functions
+`xarray `__ 2024.1.1 computation pandas-like API for N-dimensional data
============================================== ================== =============== =======================================
.. _install.excel_dependencies:
@@ -224,8 +224,8 @@ Installable with ``pip install "pandas[excel]"``.
Dependency Minimum Version pip extra Notes
================================================================== ================== =============== =============================================================
`xlrd `__ 2.0.1 excel Reading for xls files
-`xlsxwriter `__ 3.0.5 excel Writing for xlsx files
-`openpyxl `__ 3.1.0 excel Reading / writing for Excel 2010 xlsx/xlsm/xltx/xltm files
+`xlsxwriter `__ 3.2.0 excel Writing for xlsx files
+`openpyxl `__ 3.1.2 excel Reading / writing for Excel 2010 xlsx/xlsm/xltx/xltm files
`pyxlsb `__ 1.0.10 excel Reading for xlsb files
`python-calamine `__ 0.1.7 excel Reading for xls/xlsx/xlsm/xlsb/xla/xlam/ods files
`odfpy `__ 1.4.1 excel Reading / writing for OpenDocument 1.2 files
@@ -239,7 +239,7 @@ Installable with ``pip install "pandas[html]"``.
=============================================================== ================== =============== ==========================
Dependency Minimum Version pip extra Notes
=============================================================== ================== =============== ==========================
-`BeautifulSoup4 `__ 4.11.2 html HTML parser for read_html
+`BeautifulSoup4 `__ 4.12.3 html HTML parser for read_html
`html5lib `__ 1.1 html HTML parser for read_html
`lxml `__ 4.9.2 html HTML parser for read_html
=============================================================== ================== =============== ==========================
@@ -291,7 +291,7 @@ Dependency Minimum Versi
mysql,
sql-other
`psycopg2 `__ 2.9.6 postgresql PostgreSQL engine for sqlalchemy
-`pymysql `__ 1.0.2 mysql MySQL engine for sqlalchemy
+`pymysql `__ 1.1.0 mysql MySQL engine for sqlalchemy
`adbc-driver-postgresql `__ 0.10.0 postgresql ADBC Driver for PostgreSQL
`adbc-driver-sqlite `__ 0.8.0 sql-other ADBC Driver for SQLite
================================================================== ================== =============== ============================================
@@ -299,17 +299,17 @@ Dependency Minimum Versi
Other data sources
^^^^^^^^^^^^^^^^^^
-Installable with ``pip install "pandas[hdf5, parquet, feather, spss, excel]"``
+Installable with ``pip install "pandas[hdf5, parquet, iceberg, feather, spss, excel]"``
====================================================== ================== ================ ==========================================================
Dependency Minimum Version pip extra Notes
====================================================== ================== ================ ==========================================================
`PyTables `__ 3.8.0 hdf5 HDF5-based reading / writing
-`blosc `__ 1.21.3 hdf5 Compression for HDF5; only available on ``conda``
`zlib `__ hdf5 Compression for HDF5
-`fastparquet `__ 2023.10.0 - Parquet reading / writing (pyarrow is default)
+`fastparquet `__ 2024.2.0 - Parquet reading / writing (pyarrow is default)
`pyarrow `__ 10.0.1 parquet, feather Parquet, ORC, and feather reading / writing
-`pyreadstat `__ 1.2.0 spss SPSS files (.sav) reading
+`PyIceberg `__ 0.7.1 iceberg Apache Iceberg reading / writing
+`pyreadstat `__ 1.2.6 spss SPSS files (.sav) reading
`odfpy `__ 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing
====================================================== ================== ================ ==========================================================
@@ -329,10 +329,10 @@ Installable with ``pip install "pandas[fss, aws, gcp]"``
============================================ ================== =============== ==========================================================
Dependency Minimum Version pip extra Notes
============================================ ================== =============== ==========================================================
-`fsspec `__ 2022.11.0 fss, gcp, aws Handling files aside from simple local and HTTP (required
+`fsspec `__ 2023.12.2 fss, gcp, aws Handling files aside from simple local and HTTP (required
dependency of s3fs, gcsfs).
-`gcsfs `__ 2022.11.0 gcp Google Cloud Storage access
-`s3fs `__ 2022.11.0 aws Amazon S3 access
+`gcsfs `__ 2023.12.2 gcp Google Cloud Storage access
+`s3fs `__ 2023.12.2 aws Amazon S3 access
============================================ ================== =============== ==========================================================
Clipboard
diff --git a/doc/source/getting_started/intro_tutorials/includes/titanic.rst b/doc/source/getting_started/intro_tutorials/includes/titanic.rst
index 6e03b848aab06..41159516200fa 100644
--- a/doc/source/getting_started/intro_tutorials/includes/titanic.rst
+++ b/doc/source/getting_started/intro_tutorials/includes/titanic.rst
@@ -11,7 +11,7 @@ This tutorial uses the Titanic data set, stored as CSV. The data
consists of the following data columns:
- PassengerId: Id of every passenger.
-- Survived: Indication whether passenger survived. ``0`` for yes and ``1`` for no.
+- Survived: Indication whether passenger survived. ``0`` for no and ``1`` for yes.
- Pclass: One out of the 3 ticket classes: Class ``1``, Class ``2`` and Class ``3``.
- Name: Name of passenger.
- Sex: Gender of passenger.
diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst
index 5be08f163e6ce..d37eebef5c0c0 100644
--- a/doc/source/reference/arrays.rst
+++ b/doc/source/reference/arrays.rst
@@ -664,6 +664,7 @@ Data type introspection
api.types.is_datetime64_dtype
api.types.is_datetime64_ns_dtype
api.types.is_datetime64tz_dtype
+ api.types.is_dtype_equal
api.types.is_extension_array_dtype
api.types.is_float_dtype
api.types.is_int64_dtype
diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst
index fc180c8161a7e..004651ac0074f 100644
--- a/doc/source/reference/groupby.rst
+++ b/doc/source/reference/groupby.rst
@@ -79,6 +79,8 @@ Function application
DataFrameGroupBy.cumsum
DataFrameGroupBy.describe
DataFrameGroupBy.diff
+ DataFrameGroupBy.ewm
+ DataFrameGroupBy.expanding
DataFrameGroupBy.ffill
DataFrameGroupBy.first
DataFrameGroupBy.head
@@ -130,6 +132,8 @@ Function application
SeriesGroupBy.cumsum
SeriesGroupBy.describe
SeriesGroupBy.diff
+ SeriesGroupBy.ewm
+ SeriesGroupBy.expanding
SeriesGroupBy.ffill
SeriesGroupBy.first
SeriesGroupBy.head
diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst
index 805fb8b783459..37d9e7f6b7dbd 100644
--- a/doc/source/reference/io.rst
+++ b/doc/source/reference/io.rst
@@ -156,6 +156,16 @@ Parquet
read_parquet
DataFrame.to_parquet
+Iceberg
+~~~~~~~
+.. autosummary::
+ :toctree: api/
+
+ read_iceberg
+ DataFrame.to_iceberg
+
+.. warning:: ``read_iceberg`` is experimental and may change without warning.
+
ORC
~~~
.. autosummary::
diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst
index 72bb93d21a99f..8beaa73090673 100644
--- a/doc/source/user_guide/10min.rst
+++ b/doc/source/user_guide/10min.rst
@@ -178,12 +178,26 @@ Getitem (``[]``)
~~~~~~~~~~~~~~~~
For a :class:`DataFrame`, passing a single label selects a column and
-yields a :class:`Series` equivalent to ``df.A``:
+yields a :class:`Series`:
.. ipython:: python
df["A"]
+If the label is a valid Python identifier (letters, numbers, and underscores, not starting
+with a number) and does not shadow an existing attribute, you can alternatively access the
+column as an attribute:
+
+.. ipython:: python
+
+ df.A
+
+Passing a list of column labels selects multiple columns, which can be useful
+for getting a subset/rearranging:
+
+.. ipython:: python
+
+ df[["B", "A"]]
+
For a :class:`DataFrame`, passing a slice ``:`` selects matching rows:
.. ipython:: python
diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst
index 14af5d9dc22c8..8155aa0ae03fa 100644
--- a/doc/source/user_guide/basics.rst
+++ b/doc/source/user_guide/basics.rst
@@ -2064,12 +2064,12 @@ different numeric dtypes will **NOT** be combined. The following example will gi
.. ipython:: python
- df1 = pd.DataFrame(np.random.randn(8, 1), columns=["A"], dtype="float32")
+ df1 = pd.DataFrame(np.random.randn(8, 1), columns=["A"], dtype="float64")
df1
df1.dtypes
df2 = pd.DataFrame(
{
- "A": pd.Series(np.random.randn(8), dtype="float16"),
+ "A": pd.Series(np.random.randn(8), dtype="float32"),
"B": pd.Series(np.random.randn(8)),
"C": pd.Series(np.random.randint(0, 255, size=8), dtype="uint8"), # [0,255] (range of uint8)
}
diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst
index e55a6cda47ac2..9c37f317a805e 100644
--- a/doc/source/user_guide/enhancingperf.rst
+++ b/doc/source/user_guide/enhancingperf.rst
@@ -50,7 +50,7 @@ We have a :class:`DataFrame` to which we want to apply a function row-wise.
{
"a": np.random.randn(1000),
"b": np.random.randn(1000),
- "N": np.random.randint(100, 1000, (1000)),
+ "N": np.random.randint(100, 1000, (1000), dtype="int64"),
"x": "x",
}
)
@@ -83,7 +83,7 @@ using the `prun ipython magic function `.
* A boolean array.
@@ -1461,16 +1461,33 @@ Looking up values by index/column labels
Sometimes you want to extract a set of values given a sequence of row labels
and column labels, this can be achieved by ``pandas.factorize`` and NumPy indexing.
-For instance:
-.. ipython:: python
+For heterogeneous column types, we subset columns to avoid unnecessary NumPy conversions:
+
+.. code-block:: python
+
+ def pd_lookup_het(df, row_labels, col_labels):
+ rows = df.index.get_indexer(row_labels)
+ cols = df.columns.get_indexer(col_labels)
+ sub = df.take(np.unique(cols), axis=1)
+ sub = sub.take(np.unique(rows), axis=0)
+ rows = sub.index.get_indexer(row_labels)
+ values = sub.melt()["value"]
+ cols = sub.columns.get_indexer(col_labels)
+ flat_index = rows + cols * len(sub)
+ result = values[flat_index]
+ return result
+
+For homogeneous column types, it is fastest to skip column subsetting and go directly to NumPy:
+
+.. code-block:: python
- df = pd.DataFrame({'col': ["A", "A", "B", "B"],
- 'A': [80, 23, np.nan, 22],
- 'B': [80, 55, 76, 67]})
- df
- idx, cols = pd.factorize(df['col'])
- df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx]
+ def pd_lookup_hom(df, row_labels, col_labels):
+ rows = df.index.get_indexer(row_labels)
+ df = df.loc[:, sorted(set(col_labels))]
+ cols = df.columns.get_indexer(col_labels)
+ result = df.to_numpy()[rows, cols]
+ return result
Formerly this could be achieved with the dedicated ``DataFrame.lookup`` method
which was deprecated in version 1.2.0 and removed in version 2.0.0.
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 23da52f26358f..25f1e11e6b603 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -26,9 +26,10 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
text, Local clipboard, :ref:`read_clipboard`, :ref:`to_clipboard`
binary,`MS Excel `__ , :ref:`read_excel`, :ref:`to_excel`
binary,`OpenDocument `__, :ref:`read_excel`, NA
- binary,`HDF5 Format `__, :ref:`read_hdf`, :ref:`to_hdf`
+ binary,`HDF5 Format `__, :ref:`read_hdf`, :ref:`to_hdf`
binary,`Feather Format `__, :ref:`read_feather`, :ref:`to_feather`
binary,`Parquet Format `__, :ref:`read_parquet`, :ref:`to_parquet`
+ binary,`Apache Iceberg `__, :ref:`read_iceberg` , :ref:`to_iceberg`
binary,`ORC Format `__, :ref:`read_orc`, :ref:`to_orc`
binary,`Stata `__, :ref:`read_stata`, :ref:`to_stata`
binary,`SAS `__, :ref:`read_sas` , NA
@@ -1414,7 +1415,7 @@ of multi-columns indices.
.. note::
If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it
- with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index will
+ with ``df.to_csv(..., index=False)``), then any ``names`` on the columns index will
be *lost*.
.. ipython:: python
@@ -5403,6 +5404,125 @@ The above example creates a partitioned dataset that may look like:
except OSError:
pass
+.. _io.iceberg:
+
+Iceberg
+-------
+
+.. versionadded:: 3.0.0
+
+Apache Iceberg is a high performance open-source format for large analytic tables.
+Iceberg enables the use of SQL tables for big data while making it possible for different
+engines to safely work with the same tables at the same time.
+
+Iceberg supports predicate pushdown and column pruning, which are available to pandas
+users via the ``row_filter`` and ``selected_fields`` parameters of the :func:`~pandas.read_iceberg`
+function. This makes it convenient to extract from a large table a subset that fits in
+memory as a pandas ``DataFrame``.
+
+Internally, pandas uses PyIceberg_ to query Iceberg.
+
+.. _PyIceberg: https://py.iceberg.apache.org/
+
+A simple example loading all data from an Iceberg table ``my_table`` defined in the
+``my_catalog`` catalog.
+
+.. code-block:: python
+
+ df = pd.read_iceberg("my_table", catalog_name="my_catalog")
+
+Catalogs must be defined in the ``.pyiceberg.yaml`` file, usually in the home directory.
+It is possible to change properties of the catalog definition with the
+``catalog_properties`` parameter:
+
+.. code-block:: python
+
+ df = pd.read_iceberg(
+ "my_table",
+ catalog_name="my_catalog",
+ catalog_properties={"s3.secret-access-key": "my_secret"},
+ )
+
+It is also possible to fully specify the catalog in ``catalog_properties`` and not provide
+a ``catalog_name``:
+
+.. code-block:: python
+
+ df = pd.read_iceberg(
+ "my_table",
+ catalog_properties={
+ "uri": "http://127.0.0.1:8181",
+ "s3.endpoint": "http://127.0.0.1:9000",
+ },
+ )
+
+To create the ``DataFrame`` with only a subset of the columns:
+
+.. code-block:: python
+
+ df = pd.read_iceberg(
+ "my_table",
+ catalog_name="my_catalog",
+ selected_fields=["my_column_3", "my_column_7"]
+ )
+
+This makes the function execute faster, since the other columns won't be read, and it also
+saves memory, since the data from the other columns won't be loaded into the underlying
+memory of the ``DataFrame``.
+
+To fetch only a subset of the rows, we can do it with the ``limit`` parameter:
+
+.. code-block:: python
+
+ df = pd.read_iceberg(
+ "my_table",
+ catalog_name="my_catalog",
+ limit=100,
+ )
+
+This creates a ``DataFrame`` with 100 rows, assuming the table has at least that many rows.
+
+To fetch a subset of the rows based on a condition, use the ``row_filter`` parameter:
+
+.. code-block:: python
+
+ df = pd.read_iceberg(
+ "my_table",
+ catalog_name="my_catalog",
+ row_filter="distance > 10.0",
+ )
+
+Reading a particular snapshot is also possible by providing the snapshot ID as an argument
+to ``snapshot_id``.
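+
+A minimal sketch (the snapshot ID below is only a placeholder for a real ID from the
+table's history):
+
+.. code-block:: python
+
+    df = pd.read_iceberg(
+        "my_table",
+        catalog_name="my_catalog",
+        snapshot_id=1234567890123456789,
+    )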
+
+To save a ``DataFrame`` to Iceberg, use the :meth:`DataFrame.to_iceberg`
+method:
+
+.. code-block:: python
+
+ df.to_iceberg("my_table", catalog_name="my_catalog")
+
+The catalog is specified in the same way as for :func:`read_iceberg`, with the
+``catalog_name`` and ``catalog_properties`` parameters.
+
+The location of the table can be specified with the ``location`` parameter:
+
+.. code-block:: python
+
+    df.to_iceberg(
+        "my_table",
+        catalog_name="my_catalog",
+        location="s3://my-data-lake/my-iceberg-tables",
+    )
+
+It is possible to add properties to the table snapshot by passing a dictionary to the
+``snapshot_properties`` parameter.
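+
+A minimal sketch (the property key and value below are placeholders, not keys defined by
+pandas or Iceberg):
+
+.. code-block:: python
+
+    df.to_iceberg(
+        "my_table",
+        catalog_name="my_catalog",
+        snapshot_properties={"written-by": "my-pipeline"},
+    )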
+
+More information about the Iceberg format can be found in the `Apache Iceberg official
+page `__.
+
.. _io.orc:
ORC
diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst
index 60a66f5e6f2a8..af377dd7a32f2 100644
--- a/doc/source/user_guide/merging.rst
+++ b/doc/source/user_guide/merging.rst
@@ -107,7 +107,7 @@ Joining logic of the resulting axis
The ``join`` keyword specifies how to handle axis values that don't exist in the first
:class:`DataFrame`.
-``join='outer'`` takes the union of all axis values
+``join='outer'`` takes the union of all axis values.
.. ipython:: python
@@ -130,7 +130,7 @@ The ``join`` keyword specifies how to handle axis values that don't exist in the
p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False);
plt.close("all");
-``join='inner'`` takes the intersection of the axis values
+``join='inner'`` takes the intersection of the axis values.
.. ipython:: python
@@ -296,7 +296,7 @@ the index of the :class:`DataFrame` pieces:
result.index.levels
-``levels`` argument allows specifying resulting levels associated with the ``keys``
+``levels`` argument allows specifying resulting levels associated with the ``keys``.
.. ipython:: python
@@ -322,7 +322,7 @@ Appending rows to a :class:`DataFrame`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If you have a :class:`Series` that you want to append as a single row to a :class:`DataFrame`, you can convert the row into a
-:class:`DataFrame` and use :func:`concat`
+:class:`DataFrame` and use :func:`concat`.
.. ipython:: python
@@ -355,7 +355,7 @@ Merge types
their indexes which must contain unique values.
* **many-to-one**: joining a unique index to one or
more columns in a different :class:`DataFrame`.
-* **many-to-many** : joining columns on columns.
+* **many-to-many**: joining columns on columns.
.. note::
@@ -485,8 +485,9 @@ either the left or right tables, the values in the joined table will be
plt.close("all");
You can merge :class:`Series` and a :class:`DataFrame` with a :class:`MultiIndex` if the names of
-the :class:`MultiIndex` correspond to the columns from the :class:`DataFrame`. Transform
-the :class:`Series` to a :class:`DataFrame` using :meth:`Series.reset_index` before merging
+the :class:`MultiIndex` correspond to the columns from the :class:`DataFrame`. You can also
+transform the :class:`Series` to a :class:`DataFrame` using :meth:`Series.reset_index`
+before merging:
.. ipython:: python
@@ -504,7 +505,7 @@ the :class:`Series` to a :class:`DataFrame` using :meth:`Series.reset_index` bef
pd.merge(df, ser.reset_index(), on=["Let", "Num"])
-Performing an outer join with duplicate join keys in :class:`DataFrame`
+Performing an outer join with duplicate join keys in :class:`DataFrame`:
.. ipython:: python
@@ -956,7 +957,7 @@ location.
:func:`merge_ordered`
---------------------
-:func:`merge_ordered` combines order data such as numeric or time series data
+:func:`merge_ordered` combines ordered data such as numeric or time series data
with optional filling of missing data with ``fill_method``.
.. ipython:: python
@@ -1082,7 +1083,7 @@ Stack the differences on rows.
df.compare(df2, align_axis=0)
-Keep all original rows and columns with ``keep_shape=True``
+Keep all original rows and columns with ``keep_shape=True``.
.. ipython:: python
diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
index e15939eb49239..56f4c80cbde16 100644
--- a/doc/source/user_guide/missing_data.rst
+++ b/doc/source/user_guide/missing_data.rst
@@ -258,9 +258,6 @@ will convert your data to use the nullable data types supporting :class:`NA`,
such as :class:`Int64Dtype` or :class:`ArrowDtype`. This is especially helpful after reading
in data sets from IO methods where data types were inferred.
-In this example, while the dtypes of all columns are changed, we show the results for
-the first 10 columns.
-
.. ipython:: python
import io
@@ -434,7 +431,7 @@ where the index and column aligns between the original object and the filled obj
.. note::
- :meth:`DataFrame.where` can also be used to fill NA values.Same result as above.
+ :meth:`DataFrame.where` can also be used to fill NA values. Same result as above.
.. ipython:: python
diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst
index 8c5e98791a9ef..bc5a2d5ed5735 100644
--- a/doc/source/user_guide/reshaping.rst
+++ b/doc/source/user_guide/reshaping.rst
@@ -395,7 +395,7 @@ variables and the values representing the presence of those variables per row.
pd.get_dummies(df["key"])
df["key"].str.get_dummies()
-``prefix`` adds a prefix to the the column names which is useful for merging the result
+``prefix`` adds a prefix to the column names which is useful for merging the result
with the original :class:`DataFrame`:
.. ipython:: python
diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst
index 25bcb8bcc0c93..624086f7a8505 100644
--- a/doc/source/user_guide/sparse.rst
+++ b/doc/source/user_guide/sparse.rst
@@ -40,8 +40,8 @@ and in the Python interpreter.
.. ipython:: python
- 'dense : {:0.2f} bytes'.format(df.memory_usage().sum() / 1e3)
- 'sparse: {:0.2f} bytes'.format(sdf.memory_usage().sum() / 1e3)
+ f'dense: {df.memory_usage().sum()} bytes'
+ f'sparse: {sdf.memory_usage().sum()} bytes'
Functionally, their behavior should be nearly
identical to their dense counterparts.
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
index 10260cb011d90..ac0fc9e53ee94 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -2458,7 +2458,7 @@ you can use the ``tz_convert`` method.
For ``pytz`` time zones, it is incorrect to pass a time zone object directly into
the ``datetime.datetime`` constructor
- (e.g., ``datetime.datetime(2011, 1, 1, tzinfo=pytz.timezone('US/Eastern'))``.
+ (e.g., ``datetime.datetime(2011, 1, 1, tzinfo=pytz.timezone('US/Eastern'))``).
Instead, the datetime needs to be localized using the ``localize`` method
on the ``pytz`` time zone object.
diff --git a/doc/source/user_guide/user_defined_functions.rst b/doc/source/user_guide/user_defined_functions.rst
new file mode 100644
index 0000000000000..6f7fdaddac622
--- /dev/null
+++ b/doc/source/user_guide/user_defined_functions.rst
@@ -0,0 +1,419 @@
+.. _udf:
+
+{{ header }}
+
+*****************************
+User-Defined Functions (UDFs)
+*****************************
+
+In pandas, User-Defined Functions (UDFs) provide a way to extend the library’s
+functionality by allowing users to apply custom computations to their data. While
+pandas comes with a set of built-in functions for data manipulation, UDFs offer
+flexibility when built-in methods are not sufficient. These functions can be
+applied at different levels: element-wise, row-wise, column-wise, or group-wise,
+and behave differently, depending on the method used.
+
+Here’s a simple example to illustrate a UDF applied to a Series:
+
+.. ipython:: python
+
+ s = pd.Series([1, 2, 3])
+
+ # Simple UDF that adds 1 to a value
+ def add_one(x):
+ return x + 1
+
+ # Apply the function element-wise using .map
+ s.map(add_one)
+
+Why Not To Use User-Defined Functions
+-------------------------------------
+
+While UDFs provide flexibility, they come with significant drawbacks, primarily
+related to performance and behavior. When using UDFs, pandas must perform inference
+on the result, and that inference could be incorrect. Furthermore, unlike vectorized operations,
+UDFs are slower because pandas can't optimize their computations, leading to
+inefficient processing.
+
+.. note::
+ In general, most tasks can and should be accomplished using pandas’ built-in methods or vectorized operations.
+
+Despite their drawbacks, UDFs can be helpful when:
+
+* **Custom Computations Are Needed**: Implementing complex logic or domain-specific calculations that pandas'
+ built-in methods cannot handle.
+* **Extending pandas' Functionality**: Applying external libraries or specialized algorithms unavailable in pandas.
+* **Handling Complex Grouped Operations**: Performing operations on grouped data that standard methods do not support.
+
+For example:
+
+.. code-block:: python
+
+ from sklearn.linear_model import LinearRegression
+
+ # Sample data
+ df = pd.DataFrame({
+ 'group': ['A', 'A', 'A', 'B', 'B', 'B'],
+ 'x': [1, 2, 3, 1, 2, 3],
+ 'y': [2, 4, 6, 1, 2, 1.5]
+ })
+
+ # Function to fit a model to each group
+ def fit_model(group):
+ model = LinearRegression()
+ model.fit(group[['x']], group['y'])
+ group['y_pred'] = model.predict(group[['x']])
+ return group
+
+ result = df.groupby('group').apply(fit_model)
+
+
+Methods that support User-Defined Functions
+-------------------------------------------
+
+User-Defined Functions can be applied across various pandas methods:
+
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+| Method | Function Input | Function Output | Description |
++===============================+========================+==========================+==============================================================================================================================================+
+| :ref:`udf.map` | Scalar | Scalar | Apply a function to each element |
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+| :ref:`udf.apply` (axis=0) | Column (Series) | Column (Series) | Apply a function to each column |
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+| :ref:`udf.apply` (axis=1) | Row (Series) | Row (Series) | Apply a function to each row |
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+| :ref:`udf.pipe` | Series or DataFrame | Series or DataFrame | Chain functions together to apply to Series or Dataframe |
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+| :ref:`udf.filter` | Series or DataFrame | Boolean | Only accepts UDFs in group by. Function is called for each group, and the group is removed from the result if the function returns ``False`` |
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+| :ref:`udf.agg` | Series or DataFrame | Scalar or Series | Aggregate and summarizes values, e.g., sum or custom reducer |
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+| :ref:`udf.transform` (axis=0) | Column (Series) | Column (Series) | Same as :meth:`apply` with (axis=0), but it raises an exception if the function changes the shape of the data |
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+| :ref:`udf.transform` (axis=1) | Row (Series) | Row (Series) | Same as :meth:`apply` with (axis=1), but it raises an exception if the function changes the shape of the data |
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+
+When applying UDFs in pandas, it is essential to select the appropriate method based
+on your specific task. Each method has its strengths and is designed for different use
+cases. Understanding the purpose and behavior of each method will help you make informed
+decisions, ensuring more efficient and maintainable code.
+
+.. note::
+    Some of these methods can also be applied to groupby, resample, and various window objects.
+    See :ref:`groupby`, :ref:`resample()`, :ref:`rolling()`, :ref:`expanding()`,
+    and :ref:`ewm()` for details.
+
+
+.. _udf.map:
+
+:meth:`Series.map` and :meth:`DataFrame.map`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The :meth:`map` method is used specifically to apply element-wise UDFs. This means the function
+will be called for each element in the ``Series`` or ``DataFrame``, with the individual value or
+the cell as the function argument.
+
+.. ipython:: python
+
+ temperature_celsius = pd.DataFrame({
+ "NYC": [14, 21, 23],
+ "Los Angeles": [22, 28, 31],
+ })
+
+ def to_fahrenheit(value):
+ return value * (9 / 5) + 32
+
+ temperature_celsius.map(to_fahrenheit)
+
+In this example, the function ``to_fahrenheit`` will be called 6 times, once for each value
+in the ``DataFrame``, and the result of each call will be placed in the corresponding cell
+of the resulting ``DataFrame``.
+
+In general, ``map`` will be slow, as it does not make use of vectorization. Instead, a Python
+function call is required for each value, which slows things down significantly when
+working with medium or large data.
+
+When to use: Use :meth:`map` for applying element-wise UDFs to DataFrames or Series.
+
+.. _udf.apply:
+
+:meth:`Series.apply` and :meth:`DataFrame.apply`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The :meth:`apply` method allows you to apply UDFs for a whole column or row. This is different
+from :meth:`map` in that the function will be called for each column (or row), not for each individual value.
+
+.. ipython:: python
+
+ temperature_celsius = pd.DataFrame({
+ "NYC": [14, 21, 23],
+ "Los Angeles": [22, 28, 31],
+ })
+
+ def to_fahrenheit(column):
+ return column * (9 / 5) + 32
+
+ temperature_celsius.apply(to_fahrenheit)
+
+In the example, ``to_fahrenheit`` will be called only twice, as opposed to the 6 times with :meth:`map`.
+This will be faster than using :meth:`map`, since the operations for each column are vectorized, and the
+overhead of iterating over data in Python and calling Python functions is significantly reduced.
+
+In some cases, the function needs all the data in the column to compute the result. In that
+case :meth:`apply` is needed, since with :meth:`map` the function can only access one element at a time.
+
+.. ipython:: python
+
+ temperature = pd.DataFrame({
+ "NYC": [14, 21, 23],
+ "Los Angeles": [22, 28, 31],
+ })
+
+ def normalize(column):
+ return column / column.mean()
+
+ temperature.apply(normalize)
+
+In the example, the ``normalize`` function needs to compute the mean of the whole column in order
+to divide each element by it. So, we cannot call the function for each element, but we need the
+function to receive the whole column.
+
+:meth:`apply` can also execute the function by row, by specifying ``axis=1``.
+
+.. ipython:: python
+
+ temperature = pd.DataFrame({
+ "NYC": [14, 21, 23],
+ "Los Angeles": [22, 28, 31],
+ })
+
+ def hotter(row):
+ return row["Los Angeles"] - row["NYC"]
+
+ temperature.apply(hotter, axis=1)
+
+In the example, the function ``hotter`` will be called 3 times, once for each row, and each
+call will receive the whole row as its argument, allowing computations that require more than
+one value in the row.
+
+``apply`` is also available for :meth:`SeriesGroupBy.apply`, :meth:`DataFrameGroupBy.apply`,
+:meth:`Rolling.apply`, :meth:`Expanding.apply` and :meth:`Resampler.apply`. You can read more
+about ``apply`` in groupby operations :ref:`groupby.apply`.
+
+When to use: :meth:`apply` is suitable when no alternative vectorized method or UDF method is available,
+but consider optimizing performance with vectorized operations wherever possible.
+
+.. _udf.pipe:
+
+:meth:`Series.pipe` and :meth:`DataFrame.pipe`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``pipe`` method is similar to ``map`` and ``apply``, but the function receives the whole ``Series``
+or ``DataFrame`` it is called on.
+
+.. ipython:: python
+
+ temperature = pd.DataFrame({
+ "NYC": [14, 21, 23],
+ "Los Angeles": [22, 28, 31],
+ })
+
+ def normalize(df):
+ return df / df.mean().mean()
+
+ temperature.pipe(normalize)
+
+This is equivalent to calling the ``normalize`` function with the ``DataFrame`` as the parameter.
+
+.. ipython:: python
+
+ normalize(temperature)
+
+The main advantage of using ``pipe`` is readability. It allows method chaining and clearer code when
+calling multiple functions.
+
+.. ipython:: python
+
+ temperature_celsius = pd.DataFrame({
+ "NYC": [14, 21, 23],
+ "Los Angeles": [22, 28, 31],
+ })
+
+ def multiply_by_9(value):
+ return value * 9
+
+ def divide_by_5(value):
+ return value / 5
+
+ def add_32(value):
+ return value + 32
+
+ # Without `pipe`:
+ fahrenheit = add_32(divide_by_5(multiply_by_9(temperature_celsius)))
+
+ # With `pipe`:
+ fahrenheit = (temperature_celsius.pipe(multiply_by_9)
+ .pipe(divide_by_5)
+ .pipe(add_32))
+
+``pipe`` is also available for :meth:`SeriesGroupBy.pipe`, :meth:`DataFrameGroupBy.pipe` and
+:meth:`Resampler.pipe`. You can read more about ``pipe`` in groupby operations in :ref:`groupby.pipe`.
+
+When to use: Use :meth:`pipe` when you need to create a pipeline of operations and want to keep the code readable and maintainable.
+
+.. _udf.filter:
+
+:meth:`Series.filter` and :meth:`DataFrame.filter`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``filter`` method is used to select a subset of rows that match certain criteria.
+:meth:`Series.filter` and :meth:`DataFrame.filter` do not support user defined functions,
+but :meth:`SeriesGroupBy.filter` and :meth:`DataFrameGroupBy.filter` do. You can read more
+about ``filter`` in groupby operations in :ref:`groupby.filter`.
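+
+As a minimal sketch (the data below is made up for illustration), the UDF is called once per
+group and a group is kept only when the function returns ``True``:
+
+.. code-block:: python
+
+    df = pd.DataFrame({
+        "group": ["A", "A", "B", "B"],
+        "value": [1, 2, 30, 40],
+    })
+
+    # Keep only the groups whose mean value is below 10
+    df.groupby("group").filter(lambda g: g["value"].mean() < 10)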
+
+.. _udf.agg:
+
+:meth:`Series.agg` and :meth:`DataFrame.agg`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``agg`` method is used to aggregate a set of data points into a single one.
+The most common aggregation functions such as ``min``, ``max``, ``mean``, ``sum``, etc.
+are already implemented in pandas. ``agg`` allows you to implement other custom aggregation
+functions.
+
+.. ipython:: python
+
+ temperature = pd.DataFrame({
+ "NYC": [14, 21, 23],
+ "Los Angeles": [22, 28, 31],
+ })
+
+ def highest_jump(column):
+ return column.pct_change().max()
+
+ temperature.agg(highest_jump)
+
+
+When to use: Use :meth:`agg` for performing custom aggregations, where the operation returns
+a scalar value on each input.
+
+.. _udf.transform:
+
+:meth:`Series.transform` and :meth:`DataFrame.transform`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``transform`` method is similar to an aggregation, with the difference that the result is
+broadcast to the original data.
+
+.. ipython:: python
+
+ temperature = pd.DataFrame({
+ "NYC": [14, 21, 23],
+ "Los Angeles": [22, 28, 31]},
+ index=pd.date_range("2000-01-01", "2000-01-03"))
+
+ def warm_up_all_days(column):
+ return pd.Series(column.max(), index=column.index)
+
+ temperature.transform(warm_up_all_days)
+
+In the example, the ``warm_up_all_days`` function computes the ``max`` like an aggregation, but instead
+of returning just the maximum value, it returns a ``DataFrame`` with the same shape as the original one
+with the values of each day replaced by the maximum temperature of the city.
+
+``transform`` is also available for :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.transform` and
+:meth:`Resampler.transform`, where it's more common. You can read more about ``transform`` in groupby
+operations in :ref:`groupby.transform`.
+
+When to use: When you need to perform an aggregation whose result is broadcast back to the
+original structure of the DataFrame.
+
+
+Performance
+-----------
+
+While UDFs provide flexibility, their use is generally discouraged as they can introduce
+performance issues, especially when written in pure Python. To improve efficiency,
+consider using built-in ``NumPy`` or ``pandas`` functions instead of UDFs
+for common operations.
+
+.. note::
+ If performance is critical, explore **vectorized operations** before resorting
+ to UDFs.
+
+Vectorized Operations
+~~~~~~~~~~~~~~~~~~~~~
+
+Below is a comparison of using UDFs versus using Vectorized Operations:
+
+.. code-block:: python
+
+ # User-defined function
+ def calc_ratio(row):
+ return 100 * (row["one"] / row["two"])
+
+ df["new_col"] = df.apply(calc_ratio, axis=1)
+
+ # Vectorized Operation
+ df["new_col2"] = 100 * (df["one"] / df["two"])
+
+Measuring how long each operation takes:
+
+.. code-block:: text
+
+ User-defined function: 5.6435 secs
+ Vectorized: 0.0043 secs
+
+Vectorized operations in pandas are significantly faster than using :meth:`DataFrame.apply`
+with UDFs because they leverage highly optimized C functions
+via ``NumPy`` to process entire arrays at once. This approach avoids the overhead of looping
+through rows in Python and making separate function calls for each row, which is slow and
+inefficient. Additionally, ``NumPy`` arrays benefit from memory efficiency and CPU-level
+optimizations, making vectorized operations the preferred choice whenever possible.
+
+
+Improving Performance with UDFs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In scenarios where UDFs are necessary, there are still ways to mitigate their performance drawbacks.
+One approach is to use **Numba**, a Just-In-Time (JIT) compiler that can significantly speed up numerical
+Python code by compiling Python functions to optimized machine code at runtime.
+
+By annotating your UDFs with ``@numba.jit``, you can achieve performance closer to vectorized operations,
+especially for computationally heavy tasks.
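+
+A minimal sketch, assuming Numba is installed (``calc_ratio_numba`` is a hypothetical helper
+that operates on the underlying NumPy arrays rather than on DataFrame rows):
+
+.. code-block:: python
+
+    import numba
+
+    @numba.jit(nopython=True)
+    def calc_ratio_numba(one, two):
+        # Compiled to machine code on first call; the array arithmetic runs in native code
+        return 100.0 * (one / two)
+
+    df["new_col3"] = calc_ratio_numba(df["one"].to_numpy(), df["two"].to_numpy())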
+
+.. note::
+ You may also refer to the user guide on `Enhancing performance `_
+ for a more detailed guide to using **Numba**.
+
+Using :meth:`DataFrame.pipe` for Composable Logic
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Another useful pattern for improving readability and composability, especially when mixing
+vectorized logic with UDFs, is to use the :meth:`DataFrame.pipe` method.
+
+:meth:`DataFrame.pipe` doesn't improve performance directly, but it enables cleaner
+method chaining by passing the entire object into a function. This is especially helpful
+when chaining custom transformations:
+
+.. code-block:: python
+
+ def add_ratio_column(df):
+ df["ratio"] = 100 * (df["one"] / df["two"])
+ return df
+
+ df = (
+ df
+ .query("one > 0")
+ .pipe(add_ratio_column)
+ .dropna()
+ )
+
+This is functionally equivalent to calling ``add_ratio_column(df)``, but keeps your code
+clean and composable. The function you pass to :meth:`DataFrame.pipe` can use vectorized operations,
+row-wise UDFs, or any other logic; :meth:`DataFrame.pipe` is agnostic.
+
+.. note::
+ While :meth:`DataFrame.pipe` does not improve performance on its own,
+ it promotes clean, modular design and allows both vectorized and UDF-based logic
+ to be composed in method chains.
diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst
index 1dd6c5fabef04..9da73c8fd76d4 100644
--- a/doc/source/whatsnew/index.rst
+++ b/doc/source/whatsnew/index.rst
@@ -24,6 +24,7 @@ Version 2.3
.. toctree::
:maxdepth: 2
+ v2.3.1
v2.3.0
Version 2.2
diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst
index dcb0d3229aa5d..903632b488cca 100644
--- a/doc/source/whatsnew/v0.11.0.rst
+++ b/doc/source/whatsnew/v0.11.0.rst
@@ -70,14 +70,14 @@ See the section :ref:`Selection by Position ` for substitutes.
Dtypes
~~~~~~
-Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste.
+Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed ``Series``), then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste.
.. ipython:: python
- df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32')
+ df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float64')
df1
df1.dtypes
- df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float16'),
+ df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float32'),
'B': pd.Series(np.random.randn(8)),
'C': pd.Series(range(8), dtype='uint8')})
df2
diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst
index 08d3a6b188322..f2674938e7726 100644
--- a/doc/source/whatsnew/v0.12.0.rst
+++ b/doc/source/whatsnew/v0.12.0.rst
@@ -245,7 +245,7 @@ IO enhancements
format. (:issue:`3571`, :issue:`1651`, :issue:`3141`)
- If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it
- with ``df.to_csv(..., index=False``), then any ``names`` on the columns index will
+ with ``df.to_csv(..., index=False)``), then any ``names`` on the columns index will
be *lost*.
.. ipython:: python
diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst
index cbf5b7703bd79..b376530358f53 100644
--- a/doc/source/whatsnew/v0.16.1.rst
+++ b/doc/source/whatsnew/v0.16.1.rst
@@ -353,7 +353,7 @@ Deprecations
Index representation
~~~~~~~~~~~~~~~~~~~~
-The string representation of ``Index`` and its sub-classes have now been unified. These will show a single-line display if there are few values; a wrapped multi-line display for a lot of values (but less than ``display.max_seq_items``; if lots of items (> ``display.max_seq_items``) will show a truncated display (the head and tail of the data). The formatting for ``MultiIndex`` is unchanged (a multi-line wrapped display). The display width responds to the option ``display.max_seq_items``, which is defaulted to 100. (:issue:`6482`)
+The string representation of ``Index`` and its sub-classes has now been unified. These will show a single-line display if there are few values; a wrapped multi-line display for many values (but fewer than ``display.max_seq_items``); and a truncated display (the head and tail of the data) if there are lots of items (> ``display.max_seq_items``). The formatting for ``MultiIndex`` is unchanged (a multi-line wrapped display). The display width responds to the option ``display.max_seq_items``, which defaults to 100. (:issue:`6482`)
Previous behavior
diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst
index 1ae711113773f..0b1f6a2249a6c 100644
--- a/doc/source/whatsnew/v0.19.0.rst
+++ b/doc/source/whatsnew/v0.19.0.rst
@@ -1547,7 +1547,7 @@ Bug fixes
- Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`)
- Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`)
- Bug ``Series.isnull()`` and ``Series.notnull()`` ignore ``Period('NaT')`` (:issue:`13737`)
-- Bug ``Series.fillna()`` and ``Series.dropna()`` don't affect to ``Period('NaT')`` (:issue:`13737`
+- Bug ``Series.fillna()`` and ``Series.dropna()`` don't affect to ``Period('NaT')`` (:issue:`13737`)
- Bug in ``.fillna(value=np.nan)`` incorrectly raises ``KeyError`` on a ``category`` dtyped ``Series`` (:issue:`14021`)
- Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`)
- Bug in ``.resample(..)`` where incorrect warnings were triggered by IPython introspection (:issue:`13618`)
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
index 60e77a8c5d8c5..0f40f5bfa5fc9 100644
--- a/doc/source/whatsnew/v0.24.0.rst
+++ b/doc/source/whatsnew/v0.24.0.rst
@@ -1019,7 +1019,7 @@ operations has been changed to match the arithmetic operations in these cases.
The affected cases are:
- operating against a 2-dimensional ``np.ndarray`` with either 1 row or 1 column will now broadcast the same way a ``np.ndarray`` would (:issue:`23000`).
-- a list or tuple with length matching the number of rows in the :class:`DataFrame` will now raise ``ValueError`` instead of operating column-by-column (:issue:`22880`.
+- a list or tuple with length matching the number of rows in the :class:`DataFrame` will now raise ``ValueError`` instead of operating column-by-column (:issue:`22880`).
- a list or tuple with length matching the number of columns in the :class:`DataFrame` will now operate row-by-row instead of raising ``ValueError`` (:issue:`22880`).
.. ipython:: python
@@ -1556,7 +1556,7 @@ Performance improvements
(i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains`
is likewise much faster (:issue:`21369`, :issue:`21508`)
- Improved performance of :meth:`HDFStore.groups` (and dependent functions like
- :meth:`HDFStore.keys`. (i.e. ``x in store`` checks are much faster)
+ :meth:`HDFStore.keys` (i.e. ``x in store`` checks) are much faster)
(:issue:`21372`)
- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`)
- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`)
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 98cb9c4ad7b45..1aac68b90ff2f 100755
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -1114,7 +1114,7 @@ Numeric
- Bug in :class:`UInt64Index` precision loss while constructing from a list with values in the ``np.uint64`` range (:issue:`29526`)
- Bug in :class:`NumericIndex` construction that caused indexing to fail when integers in the ``np.uint64`` range were used (:issue:`28023`)
- Bug in :class:`NumericIndex` construction that caused :class:`UInt64Index` to be casted to :class:`Float64Index` when integers in the ``np.uint64`` range were used to index a :class:`DataFrame` (:issue:`28279`)
-- Bug in :meth:`Series.interpolate` when using method=`index` with an unsorted index, would previously return incorrect results. (:issue:`21037`)
+- Bug in :meth:`Series.interpolate` when using ``method='index'`` with an unsorted index, would previously return incorrect results. (:issue:`21037`)
- Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`)
- Bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` when there are duplicated indices (:issue:`30463`)
- Bug in :class:`DataFrame` cumulative operations (e.g. cumsum, cummax) incorrect casting to object-dtype (:issue:`19296`)
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index b199b113d26f2..dff73bef79135 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -1039,7 +1039,7 @@ Missing
^^^^^^^
- Calling :meth:`fillna` on an empty :class:`Series` now correctly returns a shallow copied object. The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`).
- Bug in :meth:`Series.replace` when argument ``to_replace`` is of type dict/list and is used on a :class:`Series` containing ```` was raising a ``TypeError``. The method now handles this by ignoring ```` values when doing the comparison for the replacement (:issue:`32621`)
-- Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ```` for all ``False`` or all ``True`` values using the nulllable Boolean dtype and with ``skipna=False`` (:issue:`33253`)
+- Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ```` for all ``False`` or all ``True`` values using the nullable Boolean dtype and with ``skipna=False`` (:issue:`33253`)
- Clarified documentation on interpolate with ``method=akima``. The ``der`` parameter must be scalar or ``None`` (:issue:`33426`)
- :meth:`DataFrame.interpolate` uses the correct axis convention now. Previously interpolating along columns lead to interpolation along indices and vice versa. Furthermore interpolating with methods ``pad``, ``ffill``, ``bfill`` and ``backfill`` are identical to using these methods with :meth:`DataFrame.fillna` (:issue:`12918`, :issue:`29146`)
- Bug in :meth:`DataFrame.interpolate` when called on a :class:`DataFrame` with column names of string type was throwing a ValueError. The method is now independent of the type of the column names (:issue:`33956`)
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 12ab4f27d1e62..ebde7cb14684b 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -793,7 +793,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.resample` that would throw a ``ValueError`` when resampling from ``"D"`` to ``"24H"`` over a transition into daylight savings time (DST) (:issue:`35219`)
- Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising a ``TypeError`` (:issue:`35325`)
- Bug in :meth:`.DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply`` (:issue:`34656`)
-- Bug when subsetting columns on a :class:`.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values (:issue:`9959`)
+- Bug when subsetting columns on a :class:`.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']]``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values (:issue:`9959`)
- Bug in :meth:`.DataFrameGroupBy.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`)
- Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`)
- Bug in :meth:`.DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`)
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 7b1aef07e5f00..cf016c882c225 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -666,7 +666,7 @@ be removed in a future version. Use :func:`pandas.concat` instead (:issue:`35407
.. code-block:: ipython
- In [1]: pd.Series([1, 2]).append(pd.Series([3, 4])
+ In [1]: pd.Series([1, 2]).append(pd.Series([3, 4]))
Out [1]:
:1: FutureWarning: The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
0 1
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 43aa63c284f38..0bede60758331 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -287,7 +287,7 @@ and attributes without holding entire tree in memory (:issue:`45442`).
In [1]: df = pd.read_xml(
... "/path/to/downloaded/enwikisource-latest-pages-articles.xml",
- ... iterparse = {"page": ["title", "ns", "id"]})
+ ... iterparse = {"page": ["title", "ns", "id"]}
... )
df
Out[2]:
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 329ef2859f56f..e32417e367427 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -815,8 +815,8 @@ Conversion
^^^^^^^^^^
- Bug in :meth:`DataFrame.astype` when called with ``str`` on unpickled array - the array might change in-place (:issue:`54654`)
- Bug in :meth:`DataFrame.astype` where ``errors="ignore"`` had no effect for extension types (:issue:`54654`)
+- Bug in :meth:`DataFrame.loc` was not throwing "incompatible dtype warning" (see `PDEP6 `_) when assigning a ``Series`` with a different dtype using a full column setter (e.g. ``df.loc[:, 'a'] = incompatible_value``) (:issue:`39584`)
- Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`)
-- Bug in :meth:``DataFrame.loc`` was not throwing "incompatible dtype warning" (see `PDEP6 `_) when assigning a ``Series`` with a different dtype using a full column setter (e.g. ``df.loc[:, 'a'] = incompatible_value``) (:issue:`39584`)
Strings
^^^^^^^
@@ -826,7 +826,7 @@ Strings
- Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`)
- Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`)
- Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`)
-- Bug in :meth:`Series.str.fullmatch` when ``dtype=pandas.ArrowDtype(pyarrow.string()))`` allows partial matches when regex ends in literal //$ (:issue:`56652`)
+- Bug in :meth:`Series.str.fullmatch` when ``dtype=pandas.ArrowDtype(pyarrow.string())`` allows partial matches when regex ends in literal //$ (:issue:`56652`)
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`)
- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`)
- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`)
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
index 230332319e0ac..6433fe8d2b060 100644
--- a/doc/source/whatsnew/v2.3.0.rst
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -1,6 +1,6 @@
.. _whatsnew_230:
-What's new in 2.3.0 (Month XX, 2024)
+What's new in 2.3.0 (June 4, 2025)
------------------------------------
These are the changes in pandas 2.3.0. See :ref:`release` for a full changelog
@@ -10,37 +10,26 @@ including other versions of pandas.
.. ---------------------------------------------------------------------------
-.. _whatsnew_230.upcoming_changes:
-
-Upcoming changes in pandas 3.0
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-
.. _whatsnew_230.enhancements:
Enhancements
~~~~~~~~~~~~
-.. _whatsnew_230.enhancements.enhancement1:
-
-enhancement1
-^^^^^^^^^^^^
-
-
.. _whatsnew_230.enhancements.other:
Other enhancements
^^^^^^^^^^^^^^^^^^
+- :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`)
- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called
when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been
updated to work correctly with NumPy >= 2 (:issue:`57739`)
-- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
-- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`)
+- :meth:`Series.str.decode` result now has :class:`StringDtype` when ``future.infer_string`` is True (:issue:`60709`)
+- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with :class:`StringDtype` (:issue:`60663`)
- Improved ``repr`` of :class:`.NumpyExtensionArray` to account for NEP51 (:issue:`61085`)
- The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`)
-- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`)
-- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
+- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for :class:`StringDtype` columns (:issue:`60633`)
+- The :meth:`~Series.sum` reduction is now implemented for :class:`StringDtype` columns (:issue:`59853`)
.. ---------------------------------------------------------------------------
.. _whatsnew_230.notable_bug_fixes:
@@ -50,19 +39,29 @@ Notable bug fixes
These are bug fixes that might have notable behavior changes.
-.. _whatsnew_230.notable_bug_fixes.notable_bug_fix1:
+.. _whatsnew_230.notable_bug_fixes.string_comparisons:
+
+Comparisons between different string dtypes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-notable_bug_fix1
-^^^^^^^^^^^^^^^^
+In previous versions, comparing :class:`Series` of different string dtypes (e.g. ``pd.StringDtype("pyarrow", na_value=pd.NA)`` against ``pd.StringDtype("python", na_value=np.nan)``) would result in an inconsistent result dtype or incorrectly raise. pandas will now use the hierarchy
+
+ object < (python, NaN) < (pyarrow, NaN) < (python, NA) < (pyarrow, NA)
+
+in determining the result dtype when there are different string dtypes compared. Some examples:
+
+- When ``pd.StringDtype("pyarrow", na_value=pd.NA)`` is compared against any other string dtype, the result will always be ``boolean[pyarrow]``.
+- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("pyarrow", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array.
+- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("python", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array.
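+
+For example, a minimal sketch (assuming pyarrow is installed; the values below are made up,
+and per the hierarchy above the pyarrow-backed ``pd.NA`` dtype determines the result dtype):
+
+.. code-block:: python
+
+    import numpy as np
+    import pandas as pd
+
+    left = pd.Series(["a", "b"], dtype=pd.StringDtype("python", na_value=np.nan))
+    right = pd.Series(["a", "c"], dtype=pd.StringDtype("pyarrow", na_value=pd.NA))
+
+    # The comparison result uses boolean[pyarrow], since (pyarrow, NA) is highest in the hierarchy
+    left == right
+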
.. _whatsnew_230.api_changes:
API changes
~~~~~~~~~~~
-- When enabling the ``future.infer_string`` option: Index set operations (like
- union or intersection) will now ignore the dtype of an empty ``RangeIndex`` or
- empty ``Index`` with object dtype when determining the dtype of the resulting
+- When enabling the ``future.infer_string`` option, :class:`Index` set operations (like
+ union or intersection) will now ignore the dtype of an empty :class:`RangeIndex` or
+ empty :class:`Index` with ``object`` dtype when determining the dtype of the resulting
Index (:issue:`60797`)
.. ---------------------------------------------------------------------------
@@ -73,119 +72,35 @@ Deprecations
- Deprecated allowing non-``bool`` values for ``na`` in :meth:`.str.contains`, :meth:`.str.startswith`, and :meth:`.str.endswith` for dtypes that do not already disallow these (:issue:`59615`)
- Deprecated the ``"pyarrow_numpy"`` storage option for :class:`StringDtype` (:issue:`60152`)
-.. ---------------------------------------------------------------------------
-.. _whatsnew_230.performance:
-
-Performance improvements
-~~~~~~~~~~~~~~~~~~~~~~~~
--
--
-
.. ---------------------------------------------------------------------------
.. _whatsnew_230.bug_fixes:
Bug fixes
~~~~~~~~~
-Categorical
-^^^^^^^^^^^
--
--
-
-Datetimelike
-^^^^^^^^^^^^
--
--
-
-Timedelta
-^^^^^^^^^
--
--
-
-Timezones
-^^^^^^^^^
--
--
-
Numeric
^^^^^^^
-- Enabled :class:`Series.mode` and :class:`DataFrame.mode` with ``dropna=False`` to sort the result for all dtypes in the presence of NA values; previously only certain dtypes would sort (:issue:`60702`)
--
-
-Conversion
-^^^^^^^^^^
--
--
+- Bug in :meth:`Series.mode` and :meth:`DataFrame.mode` with ``dropna=False`` where not all dtypes would sort in the presence of ``NA`` values (:issue:`60702`)
+- Bug in :meth:`Series.round` where a ``TypeError`` would always raise with ``object`` dtype (:issue:`61206`)
Strings
^^^^^^^
-- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` on :class:`StringDtype` with all NA values resulted in ``0`` and is now the empty string ``""`` (:issue:`60229`)
-- Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`)
-- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
+- Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max` where all NA values of string dtype would return float instead of string dtype (:issue:`60810`)
+- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` with all NA values of :class:`StringDtype` resulted in ``0`` instead of the empty string ``""`` (:issue:`60229`)
+- Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` where an ``Exception`` was not raised for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`)
+- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` that incorrectly returned integer results with ``method="average"`` and raised an error if it would truncate results (:issue:`59768`)
- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
+- Bug in :meth:`Series.str.center` with :class:`StringDtype` with ``storage="pyarrow"`` not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
-- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
-- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
-
-Interval
-^^^^^^^^
--
--
+- Bug in :meth:`Series.str.slice` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
Indexing
^^^^^^^^
-- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`)
--
-
-Missing
-^^^^^^^
--
--
-
-MultiIndex
-^^^^^^^^^^
--
--
+- Bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`)
I/O
^^^
-- :meth:`DataFrame.to_excel` was storing decimals as strings instead of numbers (:issue:`49598`)
--
-
-Period
-^^^^^^
--
--
-
-Plotting
-^^^^^^^^
--
--
-
-Groupby/resample/rolling
-^^^^^^^^^^^^^^^^^^^^^^^^
--
--
-
-Reshaping
-^^^^^^^^^
--
--
-
-Sparse
-^^^^^^
--
--
-
-ExtensionArray
-^^^^^^^^^^^^^^
--
--
-
-Styler
-^^^^^^
--
--
+- Bug in :meth:`DataFrame.to_excel` which stored decimals as strings instead of numbers (:issue:`49598`)
Other
^^^^^
@@ -197,3 +112,5 @@ Other
Contributors
~~~~~~~~~~~~
+
+.. contributors:: v2.2.3..v2.3.0|HEAD
diff --git a/doc/source/whatsnew/v2.3.1.rst b/doc/source/whatsnew/v2.3.1.rst
new file mode 100644
index 0000000000000..c9d8f04250c23
--- /dev/null
+++ b/doc/source/whatsnew/v2.3.1.rst
@@ -0,0 +1,42 @@
+.. _whatsnew_231:
+
+What's new in 2.3.1 (Month XX, 2025)
+------------------------------------
+
+These are the changes in pandas 2.3.1. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_231.enhancements:
+
+Enhancements
+~~~~~~~~~~~~
+-
+
+.. _whatsnew_231.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+-
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_231.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+-
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_231.other:
+
+Other
+~~~~~
+-
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_231.contributors:
+
+Contributors
+~~~~~~~~~~~~
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 2b437734a451a..8d3ac0e396430 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -30,7 +30,6 @@ Other enhancements
^^^^^^^^^^^^^^^^^^
- :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`)
- :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`)
-- :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`)
- Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`)
- Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`)
- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
@@ -52,6 +51,7 @@ Other enhancements
- :class:`Rolling` and :class:`Expanding` now support ``pipe`` method (:issue:`57076`)
- :class:`Series` now supports the Arrow PyCapsule Interface for export (:issue:`59518`)
- :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header cells (:issue:`35384`)
+- :func:`set_option` now accepts a dictionary of options, simplifying configuration of multiple settings at once (:issue:`61093`)
- :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
- :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`)
@@ -61,21 +61,28 @@ Other enhancements
- :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
+- :meth:`Series.nlargest` uses a 'stable' sort internally and will preserve original ordering.
- :class:`ArrowDtype` now supports ``pyarrow.JsonType`` (:issue:`60958`)
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
+- :class:`Easter` has gained a new constructor argument ``method`` which specifies the method used to calculate Easter — for example, Orthodox Easter (:issue:`61665`)
+- :class:`Holiday` constructor argument ``days_of_week`` will raise a ``ValueError`` when type is something other than ``None`` or ``tuple`` (:issue:`61658`)
+- :class:`Holiday` has gained the constructor argument and field ``exclude_dates`` to exclude specific datetimes from a custom holiday calendar (:issue:`54382`)
- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
- :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`)
+- :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
+- :meth:`Series.map` now accepts an ``engine`` parameter to allow execution with a third-party execution engine (:issue:`61125`)
- :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
- :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`)
- Add ``"delete_rows"`` option to ``if_exists`` argument in :meth:`DataFrame.to_sql` deleting all records of the table before inserting data (:issue:`37210`).
- Added half-year offset classes :class:`HalfYearBegin`, :class:`HalfYearEnd`, :class:`BHalfYearBegin` and :class:`BHalfYearEnd` (:issue:`60928`)
+- Added support to read and write from and to Apache Iceberg tables with the new :func:`read_iceberg` and :meth:`DataFrame.to_iceberg` functions (:issue:`61383`)
- Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`)
- Implemented :meth:`Series.str.isascii` and :meth:`Index.str.isascii` (:issue:`59091`)
- Improved deprecation message for offset aliases (:issue:`60820`)
@@ -313,12 +320,40 @@ Optional libraries below the lowest tested version may still work, but are not c
+========================+=====================+
| pytz | 2023.4 |
+------------------------+---------------------+
-| fastparquet | 2023.10.0 |
+| fastparquet | 2024.2.0 |
+------------------------+---------------------+
| adbc-driver-postgresql | 0.10.0 |
+------------------------+---------------------+
| mypy (dev) | 1.9.0 |
+------------------------+---------------------+
+| beautifulsoup4 | 4.12.3 |
++------------------------+---------------------+
+| fsspec | 2024.2.0 |
++------------------------+---------------------+
+| gcsfs | 2024.2.0 |
++------------------------+---------------------+
+| s3fs | 2024.2.0 |
++------------------------+---------------------+
+| Jinja2 | 3.1.3 |
++------------------------+---------------------+
+| matplotlib | 3.8.3 |
++------------------------+---------------------+
+| numba | 0.59.0 |
++------------------------+---------------------+
+| numexpr | 2.9.0 |
++------------------------+---------------------+
+| pymysql | 1.1.0 |
++------------------------+---------------------+
+| pyreadstat | 1.2.6 |
++------------------------+---------------------+
+| SciPy | 1.12.0 |
++------------------------+---------------------+
+| xarray | 2024.1.0 |
++------------------------+---------------------+
+| xlsxwriter | 3.2.0 |
++------------------------+---------------------+
+| zstandard | 0.22.0 |
++------------------------+---------------------+
See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.
@@ -420,6 +455,7 @@ Other Deprecations
- Deprecated lowercase strings ``w``, ``w-mon``, ``w-tue``, etc. denoting frequencies in :class:`Week` in favour of ``W``, ``W-MON``, ``W-TUE``, etc. (:issue:`58998`)
- Deprecated parameter ``method`` in :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` (:issue:`58667`)
- Deprecated strings ``w``, ``d``, ``MIN``, ``MS``, ``US`` and ``NS`` denoting units in :class:`Timedelta` in favour of ``W``, ``D``, ``min``, ``ms``, ``us`` and ``ns`` (:issue:`59051`)
+- Deprecated the ``arg`` parameter of ``Series.map``; pass the added ``func`` argument instead. (:issue:`61260`)
- Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`)
.. ---------------------------------------------------------------------------
@@ -591,6 +627,7 @@ Performance improvements
- :func:`concat` returns a :class:`RangeIndex` column when possible when ``objs`` contains :class:`Series` and :class:`DataFrame` and ``axis=0`` (:issue:`58119`)
- :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`)
- :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`)
+- :meth:`Series.nlargest` has improved performance when there are duplicate values in the index (:issue:`55767`)
- :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`)
- :meth:`Series.str.partition` with :class:`ArrowDtype` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57768`)
- Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)
@@ -621,6 +658,7 @@ Performance improvements
- Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`)
- Performance improvement in :meth:`DataFrame.__getitem__` when ``key`` is a :class:`DataFrame` with many columns (:issue:`61010`)
- Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`)
+- Performance improvement in :meth:`DataFrame.stack` when using ``future_stack=True`` and the DataFrame does not have a :class:`MultiIndex` (:issue:`58391`)
- Performance improvement in :meth:`DataFrame.where` when ``cond`` is a :class:`DataFrame` with many columns (:issue:`61010`)
- Performance improvement in :meth:`to_hdf` avoiding unnecessary reopenings of the HDF5 file to speed up data addition to files with a very large number of groups (:issue:`58248`)
- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
@@ -636,6 +674,7 @@ Bug fixes
Categorical
^^^^^^^^^^^
- Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`)
+- Bug in :meth:`DataFrame.pivot` and :meth:`DataFrame.set_index` raising an ``ArrowNotImplementedError`` for columns with pyarrow dictionary dtype (:issue:`53051`)
- Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where empty :class:`CategoricalDtype` :class:`Series` raised an error or got converted to ``null[pyarrow]`` (:issue:`59934`)
-
@@ -648,6 +687,7 @@ Datetimelike
- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`)
- Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
- Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`)
+- Bug in :meth:`DataFrame.fillna` raising an ``AssertionError`` instead of ``OutOfBoundsDatetime`` when filling a ``datetime64[ns]`` column with an out-of-bounds timestamp. Now correctly raises ``OutOfBoundsDatetime``. (:issue:`61208`)
- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` casting ``datetime64`` and ``timedelta64`` columns to ``float64`` and losing precision (:issue:`60850`)
- Bug in :meth:`DataFrame.agg` with a DataFrame containing missing values resulting in an ``IndexError`` (:issue:`58810`)
- Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on custom business day frequencies bigger than "1C" (:issue:`58664`)
@@ -658,6 +698,7 @@ Datetimelike
- Bug in :meth:`to_datetime` on float array with missing values throwing ``FloatingPointError`` (:issue:`58419`)
- Bug in :meth:`to_datetime` on float32 df with year, month, day etc. columns leads to precision issues and incorrect result. (:issue:`60506`)
- Bug in :meth:`to_datetime` reports incorrect index in case of any failure scenario. (:issue:`58298`)
+- Bug in :meth:`to_datetime` with ``format="ISO8601"`` and ``utc=True`` where naive timestamps incorrectly inherited timezone offset from previous timestamps in a series. (:issue:`61389`)
- Bug in :meth:`to_datetime` wrongly converts when ``arg`` is a ``np.datetime64`` object with unit of ``ps``. (:issue:`60341`)
- Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)
@@ -668,13 +709,16 @@ Timedelta
Timezones
^^^^^^^^^
--
+- Bug in :meth:`DatetimeIndex.union`, :meth:`DatetimeIndex.intersection`, and :meth:`DatetimeIndex.symmetric_difference` changing timezone to UTC when merging two DatetimeIndex objects with the same timezone but different units (:issue:`60080`)
-
Numeric
^^^^^^^
- Bug in :meth:`DataFrame.corr` where numerical precision errors resulted in correlations above ``1.0`` (:issue:`61120`)
+- Bug in :meth:`DataFrame.cov` raises a ``TypeError`` instead of returning potentially incorrect results or other errors (:issue:`53115`)
- Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`)
+- Bug in :meth:`Series.dot` returning ``object`` dtype for :class:`ArrowDtype` and nullable-dtype data (:issue:`61375`)
+- Bug in :meth:`Series.std` and :meth:`Series.var` when using complex-valued data (:issue:`61645`)
- Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`)
Conversion
@@ -732,10 +776,12 @@ I/O
- Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
- Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
- Bug in :meth:`DataFrame.to_excel` where the :class:`MultiIndex` index with a period level was not a date (:issue:`60099`)
+- Bug in :meth:`DataFrame.to_stata` when exporting a column containing both long strings (Stata strL) and :class:`pd.NA` values (:issue:`23633`)
- Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder="big"`` (:issue:`58969`)
- Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`)
- Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
- Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`)
+- Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`)
- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
@@ -749,6 +795,7 @@ I/O
- Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`)
- Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`)
- Bug in :meth:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` has no effect (:issue:`59884`)
+- Bug in :meth:`to_csv` where ``quotechar`` is not escaped when ``escapechar`` is not None (:issue:`61407`)
- Bug in :meth:`to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` is passed (:issue:`60274`)
Period
@@ -759,22 +806,30 @@ Period
Plotting
^^^^^^^^
- Bug in :meth:`.DataFrameGroupBy.boxplot` failed when there were multiple groupings (:issue:`14701`)
+- Bug in :meth:`DataFrame.plot.bar` when ``subplots`` and ``stacked=True`` are used together, causing incorrect stacking (:issue:`61018`)
- Bug in :meth:`DataFrame.plot.bar` with ``stacked=True`` where labels on stacked bars with zero-height segments were incorrectly positioned at the base instead of the label position of the previous segment (:issue:`59429`)
- Bug in :meth:`DataFrame.plot.line` raising ``ValueError`` when set both color and a ``dict`` style (:issue:`59461`)
- Bug in :meth:`DataFrame.plot` that causes a shift to the right when the frequency multiplier is greater than one. (:issue:`57587`)
+- Bug in :meth:`DataFrame.plot` where ``title`` would require extra titles when plotting more than one column per subplot. (:issue:`61019`)
+- Bug in :meth:`Series.plot` preventing a line and bar from being aligned on the same plot (:issue:`61161`)
+- Bug in :meth:`Series.plot` preventing a line and scatter plot from being aligned (:issue:`61005`)
- Bug in :meth:`Series.plot` with ``kind="pie"`` with :class:`ArrowDtype` (:issue:`59192`)
Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in :meth:`.DataFrameGroupBy.__len__` and :meth:`.SeriesGroupBy.__len__` would raise when the grouping contained NA values and ``dropna=False`` (:issue:`58644`)
- Bug in :meth:`.DataFrameGroupBy.any` that returned True for groups where all Timedelta values are NaT. (:issue:`59712`)
+- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` would fail when the groups were :class:`Categorical` with an NA value (:issue:`61356`)
- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`)
- Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`)
- Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
+- Bug in :meth:`.Series.rolling` when used with a :class:`.BaseIndexer` subclass and computing min/max (:issue:`46726`)
- Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
+- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` were not keeping the index name when the index had :class:`ArrowDtype` timestamp dtype (:issue:`61222`)
- Bug in :meth:`DataFrame.resample` changing index type to :class:`MultiIndex` when the dataframe is empty and using an upsample method (:issue:`55572`)
- Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
+- Bug in :meth:`DataFrameGroupBy.agg` where applying a user-defined function to an empty DataFrame returned a Series instead of an empty DataFrame. (:issue:`61503`)
- Bug in :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` for empty data frame with ``group_keys=False`` still creating output index using group keys. (:issue:`60471`)
- Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
- Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`)
@@ -783,7 +838,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)
- Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`)
- Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`)
-- Bug in :meth:`Series.resample` could raise when the the date range ended shortly before a non-existent time. (:issue:`58380`)
+- Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`)
Reshaping
^^^^^^^^^
@@ -796,9 +851,12 @@ Reshaping
- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
- Bug in :meth:`DataFrame.merge` when merging two :class:`DataFrame` on ``intc`` or ``uintc`` types on Windows (:issue:`60091`, :issue:`58713`)
- Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`)
+- Bug in :meth:`DataFrame.pivot_table` incorrectly ignoring the ``values`` argument when also supplied to the ``index`` or ``columns`` parameters (:issue:`57876`, :issue:`61292`)
- Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`)
- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`)
- Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`)
+- Bug in :func:`melt` where calling with duplicate column names in ``id_vars`` raised a misleading ``AttributeError`` (:issue:`61475`)
+- Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`)
Sparse
^^^^^^
@@ -825,17 +883,21 @@ Other
- Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`)
- Bug in :class:`Series` ignoring errors when trying to convert :class:`Series` input data to the given ``dtype`` (:issue:`60728`)
- Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`)
+- Bug in :func:`eval` where method calls on binary operations like ``(x + y).dropna()`` would raise ``AttributeError: 'BinOp' object has no attribute 'value'`` (:issue:`61175`)
- Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`)
- Bug in :func:`eval` with ``engine="numexpr"`` returning unexpected result for float division. (:issue:`59736`)
- Bug in :func:`to_numeric` raising ``TypeError`` when ``arg`` is a :class:`Timedelta` or :class:`Timestamp` scalar. (:issue:`59944`)
- Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`)
+- Bug in :meth:`DataFrame.apply` raising ``RecursionError`` when passing ``func=list[int]``. (:issue:`61565`)
- Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow to use ``tan`` function. (:issue:`55091`)
- Bug in :meth:`DataFrame.query` where using duplicate column names led to a ``TypeError``. (:issue:`59950`)
- Bug in :meth:`DataFrame.query` which raised an exception or produced incorrect results when expressions contained backtick-quoted column names containing the hash character ``#``, backticks, or characters that fall outside the ASCII range (U+0001..U+007F). (:issue:`59285`) (:issue:`49633`)
+- Bug in :meth:`DataFrame.query` which raised an exception when querying integer column names using backticks. (:issue:`60494`)
- Bug in :meth:`DataFrame.shift` where passing a ``freq`` on a DataFrame with no columns did not shift the index correctly. (:issue:`60102`)
- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`)
+- Bug in :meth:`DataFrame.sort_values` where sorting by a column explicitly named ``None`` raised a ``KeyError`` instead of sorting by the column as expected. (:issue:`61512`)
- Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`)
- Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
- Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`)
@@ -855,6 +917,7 @@ Other
- Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`)
- Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)
- Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`)
+- Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`)
- Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`)
.. ***DO NOT USE THIS SECTION***
diff --git a/environment.yml b/environment.yml
index a8c8b20e20fe4..74186bd2581c4 100644
--- a/environment.yml
+++ b/environment.yml
@@ -3,12 +3,12 @@ name: pandas-dev
channels:
- conda-forge
dependencies:
- - python=3.10
+ - python=3.11
- pip
# build dependencies
- versioneer
- - cython~=3.0.5
+ - cython<4.0.0a0
- meson=1.2.1
- meson-python=0.13.1
@@ -23,43 +23,42 @@ dependencies:
# required dependencies
- python-dateutil
- - numpy<2
+ - numpy<3
# optional dependencies
- - beautifulsoup4>=4.11.2
- - blosc
+ - beautifulsoup4>=4.12.3
- bottleneck>=1.3.6
- - fastparquet>=2023.10.0
- - fsspec>=2022.11.0
+ - fastparquet>=2024.2.0
+ - fsspec>=2023.12.2
- html5lib>=1.1
- hypothesis>=6.84.0
- - gcsfs>=2022.11.0
+ - gcsfs>=2023.12.2
- ipython
- pickleshare # Needed for IPython Sphinx directive in the docs GH#60429
- - jinja2>=3.1.2
+ - jinja2>=3.1.3
- lxml>=4.9.2
- - matplotlib>=3.6.3
- - numba>=0.56.4
- - numexpr>=2.8.4
- - openpyxl>=3.1.0
+ - matplotlib>=3.8.3
+ - numba>=0.59.0
+ - numexpr>=2.9.0
+ - openpyxl>=3.1.2
- odfpy>=1.4.1
- - py
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- - pymysql>=1.0.2
- - pyreadstat>=1.2.0
+ - pyiceberg>=0.7.1
+ - pymysql>=1.1.0
+ - pyreadstat>=1.2.6
- pytables>=3.8.0
- python-calamine>=0.1.7
- pytz>=2023.4
- pyxlsb>=1.0.10
- - s3fs>=2022.11.0
- - scipy>=1.10.0
+ - s3fs>=2023.12.2
+ - scipy>=1.12.0
- sqlalchemy>=2.0.0
- tabulate>=0.9.0
- - xarray>=2022.12.0, <=2024.9.0
+ - xarray>=2024.1.1
- xlrd>=2.0.1
- - xlsxwriter>=3.0.5
- - zstandard>=0.19.0
+ - xlsxwriter>=3.2.0
+ - zstandard>=0.22.0
# downstream packages
- dask-core
@@ -80,12 +79,10 @@ dependencies:
- flake8=7.1.0 # run in subprocess over docstring examples
- mypy=1.13.0 # pre-commit uses locally installed mypy
- tokenize-rt # scripts/check_for_inconsistent_pandas_namespace.py
- - pre-commit>=4.0.1
+ - pre-commit>=4.2.0
# documentation
- gitpython # obtain contributors from git for whatsnew
- - gitdb
- - google-auth
- natsort # DataFrame.sort_values doctest
- numpydoc
- pydata-sphinx-theme=0.16
diff --git a/meson.build b/meson.build
index 66583095a6e77..6a00e52481108 100644
--- a/meson.build
+++ b/meson.build
@@ -47,6 +47,28 @@ endif
cy = meson.get_compiler('cython')
if cy.version().version_compare('>=3.1.0')
add_project_arguments('-Xfreethreading_compatible=true', language: 'cython')
+
+ # Use shared utility code to reduce wheel sizes
+ # copied from https://github.com/scikit-learn/scikit-learn/pull/31151/files
+ cy = find_program(cy.cmd_array()[0])
+ cython_shared_src = custom_target(
+ install: false,
+ output: '_cyutility.c',
+ command: [
+ cy,
+ '-3',
+ '-Xfreethreading_compatible=true',
+ '--fast-fail',
+ '--generate-shared=' + meson.current_build_dir() / '_cyutility.c',
+ ],
+ )
+
+ py.extension_module(
+ '_cyutility',
+ cython_shared_src,
+ subdir: 'pandas/_libs',
+ install: true,
+ )
endif
# Needed by pandas.test() when it looks for the pytest ini options
diff --git a/pandas/__init__.py b/pandas/__init__.py
index c570fb8d70204..8b92ad6cdfebb 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -3,20 +3,18 @@
__docformat__ = "restructuredtext"
# Let users know if they're missing any of our hard dependencies
-_hard_dependencies = ("numpy", "dateutil")
-_missing_dependencies = []
+_hard_dependencies = ("numpy", "dateutil", "tzdata")
for _dependency in _hard_dependencies:
try:
__import__(_dependency)
except ImportError as _e: # pragma: no cover
- _missing_dependencies.append(f"{_dependency}: {_e}")
+ raise ImportError(
+ f"Unable to import required dependency {_dependency}. "
+ "Please see the traceback for details."
+ ) from _e
-if _missing_dependencies: # pragma: no cover
- raise ImportError(
- "Unable to import required dependencies:\n" + "\n".join(_missing_dependencies)
- )
-del _hard_dependencies, _dependency, _missing_dependencies
+del _hard_dependencies, _dependency
try:
# numpy compat
@@ -166,6 +164,7 @@
read_stata,
read_sas,
read_spss,
+ read_iceberg,
)
from pandas.io.json._normalize import json_normalize
@@ -321,6 +320,7 @@
"read_fwf",
"read_hdf",
"read_html",
+ "read_iceberg",
"read_json",
"read_orc",
"read_parquet",
diff --git a/pandas/_config/config.py b/pandas/_config/config.py
index ce53e05608ba7..d42d90d44f82f 100644
--- a/pandas/_config/config.py
+++ b/pandas/_config/config.py
@@ -199,9 +199,9 @@ def set_option(*args) -> None:
Parameters
----------
- *args : str | object
- Arguments provided in pairs, which will be interpreted as (pattern, value)
- pairs.
+ *args : str | object | dict
+ Arguments provided in pairs, which will be interpreted as (pattern, value),
+ or as a single dictionary containing multiple option-value pairs.
pattern: str
Regexp which should match a single option
value: object
@@ -239,6 +239,8 @@ def set_option(*args) -> None:
Examples
--------
+ Option-Value Pair Input:
+
>>> pd.set_option("display.max_columns", 4)
>>> df = pd.DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
>>> df
@@ -247,8 +249,23 @@ def set_option(*args) -> None:
1 6 7 ... 9 10
[2 rows x 5 columns]
>>> pd.reset_option("display.max_columns")
+
+ Dictionary Input:
+
+ >>> pd.set_option({"display.max_columns": 4, "display.precision": 1})
+ >>> df = pd.DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
+ >>> df
+ 0 1 ... 3 4
+ 0 1 2 ... 4 5
+ 1 6 7 ... 9 10
+ [2 rows x 5 columns]
+ >>> pd.reset_option("display.max_columns")
+ >>> pd.reset_option("display.precision")
"""
- # must at least 1 arg deal with constraints later
+ # Handle dictionary input
+ if len(args) == 1 and isinstance(args[0], dict):
+ args = tuple(kv for item in args[0].items() for kv in item)
+
nargs = len(args)
if not nargs or nargs % 2 != 0:
raise ValueError("Must provide an even number of non-keyword arguments")
@@ -440,9 +457,10 @@ def option_context(*args) -> Generator[None]:
Parameters
----------
- *args : str | object
+ *args : str | object | dict
An even amount of arguments provided in pairs which will be
- interpreted as (pattern, value) pairs.
+ interpreted as (pattern, value) pairs. Alternatively, a single
+ dictionary of {pattern: value} may be provided.
Returns
-------
@@ -471,7 +489,12 @@ def option_context(*args) -> Generator[None]:
>>> from pandas import option_context
>>> with option_context("display.max_rows", 10, "display.max_columns", 5):
... pass
+ >>> with option_context({"display.max_rows": 10, "display.max_columns": 5}):
+ ... pass
"""
+ if len(args) == 1 and isinstance(args[0], dict):
+ args = tuple(kv for item in args[0].items() for kv in item)
+
if len(args) % 2 != 0 or len(args) < 2:
raise ValueError(
"Provide an even amount of arguments as "
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 9dfa4a9486558..f584c0ff9f614 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -391,10 +391,11 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
# clip `covxy / divisor` to ensure coeff is within bounds
if divisor != 0:
val = covxy / divisor
- if val > 1.0:
- val = 1.0
- elif val < -1.0:
- val = -1.0
+ if not cov:
+ if val > 1.0:
+ val = 1.0
+ elif val < -1.0:
+ val = -1.0
result[xi, yi] = result[yi, xi] = val
else:
result[xi, yi] = result[yi, xi] = NaN
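
The guard added here matters because correlation coefficients are mathematically bounded by [-1, 1] (only floating-point round-off can push them outside), while covariances have no such bound. A plain-Python mirror of the clipping logic, for illustration only:

    def clip_corr_result(val: float, cov: bool) -> float:
        # Clip only correlations; covariances are returned untouched because
        # clipping them would silently corrupt legitimate values.
        if not cov:
            if val > 1.0:
                val = 1.0
            elif val < -1.0:
                val = -1.0
        return val
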
diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi
index dda23d9dec98b..60e4ff3fab74e 100644
--- a/pandas/_libs/arrays.pyi
+++ b/pandas/_libs/arrays.pyi
@@ -1,4 +1,4 @@
-from typing import Sequence
+from collections.abc import Sequence
import numpy as np
diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
index 7a810a988e50e..5ee359d84a6ed 100644
--- a/pandas/_libs/hashtable.pyi
+++ b/pandas/_libs/hashtable.pyi
@@ -1,6 +1,6 @@
+from collections.abc import Hashable
from typing import (
Any,
- Hashable,
Literal,
overload,
)
diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi
index ffe6c7730bcdc..a680304d55ea2 100644
--- a/pandas/_libs/internals.pyi
+++ b/pandas/_libs/internals.pyi
@@ -1,6 +1,8 @@
-from typing import (
+from collections.abc import (
Iterator,
Sequence,
+)
+from typing import (
final,
overload,
)
diff --git a/pandas/_libs/json.pyi b/pandas/_libs/json.pyi
index bc4fe68573b94..349320d69d707 100644
--- a/pandas/_libs/json.pyi
+++ b/pandas/_libs/json.pyi
@@ -1,6 +1,6 @@
+from collections.abc import Callable
from typing import (
Any,
- Callable,
)
def ujson_dumps(
diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
index daaaacee3487d..331233f37f63d 100644
--- a/pandas/_libs/lib.pyi
+++ b/pandas/_libs/lib.pyi
@@ -1,12 +1,14 @@
# TODO(npdtypes): Many types specified here can be made more specific/accurate;
# the more specific versions are specified in comments
+from collections.abc import (
+ Callable,
+ Generator,
+ Hashable,
+)
from decimal import Decimal
from typing import (
Any,
- Callable,
Final,
- Generator,
- Hashable,
Literal,
TypeAlias,
overload,
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 38d9a8f62417c..3b7d659c2150e 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2,6 +2,7 @@ from collections import abc
from decimal import Decimal
from enum import Enum
from sys import getsizeof
+from types import GenericAlias
from typing import (
Literal,
_GenericAlias,
@@ -777,7 +778,10 @@ cpdef ndarray[object] ensure_string_array(
return out
arr = arr.to_numpy(dtype=object)
elif not util.is_array(arr):
- arr = np.array(arr, dtype="object")
+ # GH#61155: Guarantee a 1-d result when array is a list of lists
+ input_arr = arr
+ arr = np.empty(len(arr), dtype="object")
+ arr[:] = input_arr
result = np.asarray(arr, dtype="object")
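
The pre-allocated ``np.empty`` buffer is needed because ``np.array`` turns nested equal-length lists into a 2-d array, whereas slice-assignment into a 1-d object array keeps each inner list as a single element. A quick NumPy illustration:

    import numpy as np

    data = [[1, 2], [3, 4]]

    np.array(data, dtype="object").shape   # (2, 2) -- nested lists become 2-d

    out = np.empty(len(data), dtype="object")
    out[:] = data
    out.shape                              # (2,) -- each inner list stays one element
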
@@ -1295,7 +1299,7 @@ cdef bint c_is_list_like(object obj, bint allow_sets) except -1:
getattr(obj, "__iter__", None) is not None and not isinstance(obj, type)
# we do not count strings/unicode/bytes as list-like
# exclude Generic types that have __iter__
- and not isinstance(obj, (str, bytes, _GenericAlias))
+ and not isinstance(obj, (str, bytes, _GenericAlias, GenericAlias))
# exclude zero-dimensional duck-arrays, effectively scalars
and not (hasattr(obj, "ndim") and obj.ndim == 0)
# exclude sets if allow_sets is False
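
The extra ``types.GenericAlias`` check matters because builtin parameterized generics such as ``list[int]`` expose ``__iter__`` via their origin class and are not instances of ``type``, so the earlier conditions alone would classify them as list-like (plausibly the mechanism behind the ``func=list[int]`` recursion entry in the whatsnew above). A small illustration:

    from types import GenericAlias

    obj = list[int]

    isinstance(obj, GenericAlias)               # True  -- parameterized builtin generic
    getattr(obj, "__iter__", None) is not None  # True  -- proxied from list
    isinstance(obj, type)                       # False -- so it slips past the old checks
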
diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build
index a50976767928a..33fc65e5034d0 100644
--- a/pandas/_libs/meson.build
+++ b/pandas/_libs/meson.build
@@ -148,6 +148,12 @@ if get_option('buildtype') == 'debug'
cython_args += ['--gdb']
endif
+# Use shared utility code to reduce wheel sizes
+# copied from https://github.com/scikit-learn/scikit-learn/pull/31151/files
+if cy.version().version_compare('>=3.1.0')
+ cython_args += ['--shared=pandas._libs._cyutility']
+endif
+
foreach ext_name, ext_dict : libs_sources
py.extension_module(
ext_name,
diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi
index 6738a1dff4a9e..81fe81930539d 100644
--- a/pandas/_libs/ops.pyi
+++ b/pandas/_libs/ops.pyi
@@ -1,7 +1,9 @@
-from typing import (
- Any,
+from collections.abc import (
Callable,
Iterable,
+)
+from typing import (
+ Any,
Literal,
TypeAlias,
overload,
diff --git a/pandas/_libs/parsers.pyi b/pandas/_libs/parsers.pyi
index 253bb7303cefb..d18f54c546232 100644
--- a/pandas/_libs/parsers.pyi
+++ b/pandas/_libs/parsers.pyi
@@ -1,5 +1,5 @@
+from collections.abc import Hashable
from typing import (
- Hashable,
Literal,
)
diff --git a/pandas/_libs/properties.pyi b/pandas/_libs/properties.pyi
index aaa44a0cf47bf..bbde6ec454202 100644
--- a/pandas/_libs/properties.pyi
+++ b/pandas/_libs/properties.pyi
@@ -1,5 +1,5 @@
+from collections.abc import Sequence
from typing import (
- Sequence,
overload,
)
diff --git a/pandas/_libs/sparse.pyi b/pandas/_libs/sparse.pyi
index 536265b25425e..8727b1a5b0420 100644
--- a/pandas/_libs/sparse.pyi
+++ b/pandas/_libs/sparse.pyi
@@ -1,4 +1,4 @@
-from typing import Sequence
+from collections.abc import Sequence
import numpy as np
diff --git a/pandas/_libs/testing.pyi b/pandas/_libs/testing.pyi
index ab87e58eba9b9..4758483b3b5e7 100644
--- a/pandas/_libs/testing.pyi
+++ b/pandas/_libs/testing.pyi
@@ -1,4 +1,4 @@
-from typing import Mapping
+from collections.abc import Mapping
def assert_dict_equal(a: Mapping, b: Mapping, compare_keys: bool = ...) -> bool: ...
def assert_almost_equal(
diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx
index c4acf72ab87d8..45552108f8c15 100644
--- a/pandas/_libs/tslibs/conversion.pyx
+++ b/pandas/_libs/tslibs/conversion.pyx
@@ -797,7 +797,7 @@ cdef int64_t parse_pydatetime(
dts : *npy_datetimestruct
Needed to use in pydatetime_to_dt64, which writes to it.
creso : NPY_DATETIMEUNIT
- Resolution to store the the result.
+ Resolution to store the result.
Raises
------
diff --git a/pandas/_libs/tslibs/meson.build b/pandas/_libs/tslibs/meson.build
index 052a8568b76af..ac43dc7db5fb7 100644
--- a/pandas/_libs/tslibs/meson.build
+++ b/pandas/_libs/tslibs/meson.build
@@ -28,6 +28,12 @@ if get_option('buildtype') == 'debug'
cython_args += ['--gdb']
endif
+# Use shared utility code to reduce wheel sizes
+# copied from https://github.com/scikit-learn/scikit-learn/pull/31151/files
+if cy.version().version_compare('>=3.1.0')
+ cython_args += ['--shared=pandas._libs._cyutility']
+endif
+
foreach ext_name, ext_dict : tslibs_sources
py.extension_module(
ext_name,
diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi
index d3b10fbe79cb9..ff3bb5b70801e 100644
--- a/pandas/_libs/tslibs/nattype.pyi
+++ b/pandas/_libs/tslibs/nattype.pyi
@@ -1,7 +1,5 @@
from datetime import (
- date as date_,
datetime,
- time as time_,
timedelta,
tzinfo as _tzinfo,
)
@@ -99,7 +97,6 @@ class NaTType:
ambiguous: bool | Literal["raise"] | NaTType = ...,
nonexistent: TimestampNonexistent = ...,
) -> NaTType: ...
- def combine(cls, date: date_, time: time_) -> NoReturn: ...
@property
def tzinfo(self) -> None: ...
@property
diff --git a/pandas/_libs/tslibs/offsets.pyi b/pandas/_libs/tslibs/offsets.pyi
index f9f56d38c5e0a..a71aa42b4f671 100644
--- a/pandas/_libs/tslibs/offsets.pyi
+++ b/pandas/_libs/tslibs/offsets.pyi
@@ -1,3 +1,4 @@
+from collections.abc import Collection
from datetime import (
datetime,
time,
@@ -5,7 +6,6 @@ from datetime import (
)
from typing import (
Any,
- Collection,
Literal,
TypeVar,
overload,
@@ -230,7 +230,13 @@ class FY5253Quarter(FY5253Mixin):
variation: Literal["nearest", "last"] = ...,
) -> None: ...
-class Easter(SingleConstructorOffset): ...
+class Easter(SingleConstructorOffset):
+ def __init__(
+ self,
+ n: int = ...,
+ normalize: bool = ...,
+ method: int = ...,
+ ) -> None: ...
class _CustomBusinessMonth(BusinessMixin):
def __init__(
diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx
index a16964435ef50..87214c3758d5c 100644
--- a/pandas/_libs/tslibs/offsets.pyx
+++ b/pandas/_libs/tslibs/offsets.pyx
@@ -4520,6 +4520,12 @@ cdef class Easter(SingleConstructorOffset):
The number of years represented.
normalize : bool, default False
Normalize start/end dates to midnight before generating date range.
+ method : int, default 3
+ The method used to calculate the date of Easter. Valid options are:
+ - 1 (EASTER_JULIAN): Original calculation in Julian calendar
+ - 2 (EASTER_ORTHODOX): Original method, date converted to Gregorian calendar
+ - 3 (EASTER_WESTERN): Revised method, in Gregorian calendar
+ These constants are defined in the `dateutil.easter` module.
See Also
--------
@@ -4532,15 +4538,32 @@ cdef class Easter(SingleConstructorOffset):
Timestamp('2022-04-17 00:00:00')
"""
+ _attributes = tuple(["n", "normalize", "method"])
+
+ cdef readonly:
+ int method
+
+ from dateutil.easter import EASTER_WESTERN
+
+ def __init__(self, n=1, normalize=False, method=EASTER_WESTERN):
+ BaseOffset.__init__(self, n, normalize)
+
+ self.method = method
+
+ if method < 1 or method > 3:
+ raise ValueError(f"Method must be 1<=method<=3, got {method}")
+
cpdef __setstate__(self, state):
+ from dateutil.easter import EASTER_WESTERN
self.n = state.pop("n")
self.normalize = state.pop("normalize")
+ self.method = state.pop("method", EASTER_WESTERN)
@apply_wraps
def _apply(self, other: datetime) -> datetime:
from dateutil.easter import easter
- current_easter = easter(other.year)
+ current_easter = easter(other.year, method=self.method)
current_easter = datetime(
current_easter.year, current_easter.month, current_easter.day
)
@@ -4555,7 +4578,7 @@ cdef class Easter(SingleConstructorOffset):
# NOTE: easter returns a datetime.date so we have to convert to type of
# other
- new = easter(other.year + n)
+ new = easter(other.year + n, method=self.method)
new = datetime(
new.year,
new.month,
@@ -4573,7 +4596,7 @@ cdef class Easter(SingleConstructorOffset):
from dateutil.easter import easter
- return date(dt.year, dt.month, dt.day) == easter(dt.year)
+ return date(dt.year, dt.month, dt.day) == easter(dt.year, method=self.method)
# ----------------------------------------------------------------------
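
A usage sketch of the new ``method`` argument (assumes a pandas build containing this change; the method constants come from ``dateutil.easter`` as noted in the docstring):

    import pandas as pd
    from dateutil.easter import EASTER_ORTHODOX

    ts = pd.Timestamp("2022-01-01")

    western = ts + pd.offsets.Easter()                         # default method=3 (Western)
    orthodox = ts + pd.offsets.Easter(method=EASTER_ORTHODOX)  # method=2

    print(western, orthodox)
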
diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx
index fb89f1328529d..b443aa7bede22 100644
--- a/pandas/_libs/tslibs/strptime.pyx
+++ b/pandas/_libs/tslibs/strptime.pyx
@@ -444,6 +444,9 @@ def array_strptime(
else:
val = str(val)
+ out_local = 0
+ out_tzoffset = 0
+
if fmt == "ISO8601":
string_to_dts_succeeded = not string_to_dts(
val, &dts, &out_bestunit, &out_local,
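
Resetting ``out_local`` and ``out_tzoffset`` on every iteration prevents an offset parsed from one element from leaking into a later timezone-naive element (:issue:`61389`). A sketch of the affected call, with made-up example values and assuming the usual ``utc=True`` semantics where naive inputs are localized to UTC:

    import pandas as pd

    vals = ["2020-01-01T00:00:00+01:00", "2020-01-02T00:00:00"]  # second value is naive

    result = pd.to_datetime(vals, format="ISO8601", utc=True)
    # Expected: the naive element becomes 2020-01-02 00:00:00+00:00 rather than
    # being shifted as if it carried the +01:00 offset of the previous element.
    print(result)
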
diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi
index 979a5666661b2..c885543b2fc6d 100644
--- a/pandas/_libs/tslibs/timedeltas.pyi
+++ b/pandas/_libs/tslibs/timedeltas.pyi
@@ -3,7 +3,6 @@ from typing import (
ClassVar,
Literal,
TypeAlias,
- TypeVar,
overload,
)
@@ -60,7 +59,6 @@ UnitChoices: TypeAlias = Literal[
"nanos",
"nanosecond",
]
-_S = TypeVar("_S", bound=timedelta)
def get_unit_for_round(freq, creso: int) -> int: ...
def disallow_ambiguous_unit(unit: str | None) -> None: ...
@@ -95,11 +93,11 @@ class Timedelta(timedelta):
_value: int # np.int64
# error: "__new__" must return a class instance (got "Union[Timestamp, NaTType]")
def __new__( # type: ignore[misc]
- cls: type[_S],
+ cls: type[Self],
value=...,
unit: str | None = ...,
**kwargs: float | np.integer | np.floating,
- ) -> _S | NaTType: ...
+ ) -> Self | NaTType: ...
@classmethod
def _from_value_and_reso(cls, value: np.int64, reso: int) -> Timedelta: ...
@property
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index 23197b9a55afc..390267db8267f 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -200,8 +200,9 @@ class MinMaxReso:
See also: timedeltas.MinMaxReso
"""
- def __init__(self, name):
+ def __init__(self, name, docstring):
self._name = name
+ self.__doc__ = docstring
def __get__(self, obj, type=None):
cls = Timestamp
@@ -216,11 +217,15 @@ class MinMaxReso:
if obj is None:
# i.e. this is on the class, default to nanos
- return cls(val)
+ result = cls(val)
elif self._name == "resolution":
- return Timedelta._from_value_and_reso(val, obj._creso)
+ result = Timedelta._from_value_and_reso(val, obj._creso)
else:
- return Timestamp._from_value_and_reso(val, obj._creso, tz=None)
+ result = Timestamp._from_value_and_reso(val, obj._creso, tz=None)
+
+ result.__doc__ = self.__doc__
+
+ return result
def __set__(self, obj, value):
raise AttributeError(f"{self._name} is not settable.")
@@ -235,9 +240,74 @@ cdef class _Timestamp(ABCTimestamp):
dayofweek = _Timestamp.day_of_week
dayofyear = _Timestamp.day_of_year
- min = MinMaxReso("min")
- max = MinMaxReso("max")
- resolution = MinMaxReso("resolution") # GH#21336, GH#21365
+ _docstring_min = """
+ Returns the minimum bound possible for Timestamp.
+
+ This property provides access to the smallest possible value that
+ can be represented by a Timestamp object.
+
+ Returns
+ -------
+ Timestamp
+
+ See Also
+ --------
+ Timestamp.max: Returns the maximum bound possible for Timestamp.
+ Timestamp.resolution: Returns the smallest possible difference between
+ non-equal Timestamp objects.
+
+ Examples
+ --------
+ >>> pd.Timestamp.min
+ Timestamp('1677-09-21 00:12:43.145224193')
+ """
+
+ _docstring_max = """
+ Returns the maximum bound possible for Timestamp.
+
+ This property provides access to the largest possible value that
+ can be represented by a Timestamp object.
+
+ Returns
+ -------
+ Timestamp
+
+ See Also
+ --------
+ Timestamp.min: Returns the minimum bound possible for Timestamp.
+ Timestamp.resolution: Returns the smallest possible difference between
+ non-equal Timestamp objects.
+
+ Examples
+ --------
+ >>> pd.Timestamp.max
+ Timestamp('2262-04-11 23:47:16.854775807')
+ """
+
+ _docstring_reso = """
+ Returns the smallest possible difference between non-equal Timestamp objects.
+
+ The resolution value is determined by the underlying representation of time
+ units and is equivalent to Timedelta(nanoseconds=1).
+
+ Returns
+ -------
+ Timedelta
+
+ See Also
+ --------
+ Timestamp.max: Returns the maximum bound possible for Timestamp.
+ Timestamp.min: Returns the minimum bound possible for Timestamp.
+
+ Examples
+ --------
+ >>> pd.Timestamp.resolution
+ Timedelta('0 days 00:00:00.000000001')
+ """
+
+ min = MinMaxReso("min", _docstring_min)
+ max = MinMaxReso("max", _docstring_max)
+ resolution = MinMaxReso("resolution", _docstring_reso) # GH#21336, GH#21365
@property
def value(self) -> int:
diff --git a/pandas/_libs/tslibs/timezones.pyi b/pandas/_libs/tslibs/timezones.pyi
index 4e9f0c6ae6c33..26ffa568a8480 100644
--- a/pandas/_libs/tslibs/timezones.pyi
+++ b/pandas/_libs/tslibs/timezones.pyi
@@ -1,8 +1,8 @@
+from collections.abc import Callable
from datetime import (
datetime,
tzinfo,
)
-from typing import Callable
import numpy as np
diff --git a/pandas/_libs/tslibs/tzconversion.pyi b/pandas/_libs/tslibs/tzconversion.pyi
index 2108fa0f35547..07ee46858577a 100644
--- a/pandas/_libs/tslibs/tzconversion.pyi
+++ b/pandas/_libs/tslibs/tzconversion.pyi
@@ -1,8 +1,8 @@
+from collections.abc import Iterable
from datetime import (
timedelta,
tzinfo,
)
-from typing import Iterable
import numpy as np
diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi
index b4bdd7e05cf0e..99413751cd5c2 100644
--- a/pandas/_libs/window/aggregations.pyi
+++ b/pandas/_libs/window/aggregations.pyi
@@ -1,6 +1,6 @@
+from collections.abc import Callable
from typing import (
Any,
- Callable,
Literal,
)
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
index 2baed13cbd7be..0c8ea28b60ce8 100644
--- a/pandas/_libs/window/aggregations.pyx
+++ b/pandas/_libs/window/aggregations.pyx
@@ -6,6 +6,7 @@ from libc.math cimport (
sqrt,
)
from libcpp.deque cimport deque
+from libcpp.stack cimport stack
from libcpp.unordered_map cimport unordered_map
from pandas._libs.algos cimport TiebreakEnumType
@@ -988,39 +989,29 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start,
# ----------------------------------------------------------------------
-# Moving maximum / minimum code taken from Bottleneck
-# Licence at LICENSES/BOTTLENECK_LICENCE
-
-
-cdef float64_t init_mm(float64_t ai, Py_ssize_t *nobs, bint is_max) noexcept nogil:
-
- if ai == ai:
- nobs[0] = nobs[0] + 1
- elif is_max:
- ai = MINfloat64
- else:
- ai = MAXfloat64
-
- return ai
-
-
-cdef void remove_mm(float64_t aold, Py_ssize_t *nobs) noexcept nogil:
- """ remove a value from the mm calc """
- if aold == aold:
- nobs[0] = nobs[0] - 1
-
-
-cdef float64_t calc_mm(int64_t minp, Py_ssize_t nobs,
- float64_t value) noexcept nogil:
- cdef:
- float64_t result
+cdef int64_t bisect_left(
+ deque[int64_t]& a,
+ int64_t x,
+ int64_t lo=0,
+ int64_t hi=-1
+) nogil:
+ """Same as https://docs.python.org/3/library/bisect.html."""
+
+ cdef int64_t mid
+ if hi == -1:
+ hi = a.size()
+ while lo < hi:
+ mid = (lo + hi) // 2
+ if a.at(mid) < x:
+ lo = mid + 1
+ else:
+ hi = mid
+ return lo
- if nobs >= minp:
- result = value
- else:
- result = NaN
+from libc.math cimport isnan
- return result
+# Prior version of moving maximum / minimum code taken from Bottleneck
+# Licence at LICENSES/BOTTLENECK_LICENCE
def roll_max(ndarray[float64_t] values, ndarray[int64_t] start,
@@ -1068,69 +1059,110 @@ def roll_min(ndarray[float64_t] values, ndarray[int64_t] start,
return _roll_min_max(values, start, end, minp, is_max=0)
-cdef _roll_min_max(ndarray[float64_t] values,
- ndarray[int64_t] starti,
- ndarray[int64_t] endi,
- int64_t minp,
- bint is_max):
+def _roll_min_max(
+ ndarray[float64_t] values,
+ ndarray[int64_t] start,
+ ndarray[int64_t] end,
+ int64_t minp,
+ bint is_max
+):
cdef:
- float64_t ai
- int64_t curr_win_size, start
- Py_ssize_t i, k, nobs = 0, N = len(starti)
- deque Q[int64_t] # min/max always the front
- deque W[int64_t] # track the whole window for nobs compute
+ Py_ssize_t i, i_next, k, valid_start, last_end, last_start, N = len(start)
+ # Indices of bounded extrema in `values`. `candidates[i]` is always increasing.
+ # `values[candidates[i]]` is decreasing for max and increasing for min.
+ deque candidates[int64_t]
+ # Indices of largest windows that "cover" preceding windows.
+ stack dominators[int64_t]
ndarray[float64_t, ndim=1] output
+ Py_ssize_t this_start, this_end, stash_start
+ int64_t q_idx
+
output = np.empty(N, dtype=np.float64)
- Q = deque[int64_t]()
- W = deque[int64_t]()
+ candidates = deque[int64_t]()
+ dominators = stack[int64_t]()
+
+ # This function was "ported" / translated from sliding_min_max()
+ # in /pandas/core/_numba/kernels/min_max_.py.
+ # (See there for credits and some comments.)
+ # Code translation assumptions/rules:
+ # - min_periods --> minp
+ # - deque[0] --> front()
+ # - deque[-1] --> back()
+ # - stack[-1] --> top()
+ # - bool(stack/deque) --> !empty()
+ # - deque.append() --> push_back()
+ # - stack.append() --> push()
+ # - deque.popleft --> pop_front()
+ # - deque.pop() --> pop_back()
with nogil:
+ if minp < 1:
+ minp = 1
+
+ if N>2:
+ i_next = N - 1
+ for i in range(N - 2, -1, -1):
+ if start[i_next] < start[i] \
+ and (
+ dominators.empty()
+ or start[dominators.top()] > start[i_next]
+ ):
+ dominators.push(i_next)
+ i_next = i
+
+ # NaN tracking to guarantee minp
+ valid_start = -minp
+
+ last_end = 0
+ last_start = -1
- # This is using a modified version of the C++ code in this
- # SO post: https://stackoverflow.com/a/12239580
- # The original impl didn't deal with variable window sizes
- # So the code was optimized for that
-
- # first window's size
- curr_win_size = endi[0] - starti[0]
- # GH 32865
- # Anchor output index to values index to provide custom
- # BaseIndexer support
for i in range(N):
+ this_start = start[i]
+ this_end = end[i]
- curr_win_size = endi[i] - starti[i]
- if i == 0:
- start = starti[i]
- else:
- start = endi[i - 1]
-
- for k in range(start, endi[i]):
- ai = init_mm(values[k], &nobs, is_max)
- # Discard previous entries if we find new min or max
- if is_max:
- while not Q.empty() and ((ai >= values[Q.back()]) or
- values[Q.back()] != values[Q.back()]):
- Q.pop_back()
- else:
- while not Q.empty() and ((ai <= values[Q.back()]) or
- values[Q.back()] != values[Q.back()]):
- Q.pop_back()
- Q.push_back(k)
- W.push_back(k)
-
- # Discard entries outside and left of current window
- while not Q.empty() and Q.front() <= starti[i] - 1:
- Q.pop_front()
- while not W.empty() and W.front() <= starti[i] - 1:
- remove_mm(values[W.front()], &nobs)
- W.pop_front()
-
- # Save output based on index in input value array
- if not Q.empty() and curr_win_size > 0:
- output[i] = calc_mm(minp, nobs, values[Q.front()])
+ if (not dominators.empty() and dominators.top() == i):
+ dominators.pop()
+
+ if not (this_end > last_end
+ or (this_end == last_end and this_start >= last_start)):
+ raise ValueError(
+ "Start/End ordering requirement is violated at index {}".format(i))
+
+ if dominators.empty():
+ stash_start = this_start
else:
+ stash_start = min(this_start, start[dominators.top()])
+
+ while not candidates.empty() and candidates.front() < stash_start:
+ candidates.pop_front()
+
+ for k in range(last_end, this_end):
+ if not isnan(values[k]):
+ valid_start += 1
+ while valid_start >= 0 and isnan(values[valid_start]):
+ valid_start += 1
+
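+                # Keep the candidate deque monotone: once values[k] enters the
+                # window, any trailing candidate it dominates can never be the
+                # extremum again.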
+ if is_max:
+ while (not candidates.empty()
+ and values[k] >= values[candidates.back()]):
+ candidates.pop_back()
+ else:
+ while (not candidates.empty()
+ and values[k] <= values[candidates.back()]):
+ candidates.pop_back()
+ candidates.push_back(k)
+
+ if candidates.empty() or this_start > valid_start:
output[i] = NaN
+ elif candidates.front() >= this_start:
+ # ^^ This is here to avoid costly bisection for fixed window sizes.
+ output[i] = values[candidates.front()]
+ else:
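+                # candidates.front() is already known to lie left of this_start,
+                # so the bisection can start at index 1; it returns the
+                # left-most candidate inside the current window.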
+ q_idx = bisect_left(candidates, this_start, lo=1)
+ output[i] = values[candidates[q_idx]]
+ last_end = this_end
+ last_start = this_start
return output
@@ -1322,8 +1354,8 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start,
if interpolation_type == LINEAR:
vlow = skiplist_get(skiplist, idx, &ret)
vhigh = skiplist_get(skiplist, idx + 1, &ret)
- output[i] = ((vlow + (vhigh - vlow) *
- (idx_with_fraction - idx)))
+ output[i] = (vlow + (vhigh - vlow) *
+ (idx_with_fraction - idx))
elif interpolation_type == LOWER:
output[i] = skiplist_get(skiplist, idx, &ret)
elif interpolation_type == HIGHER:
diff --git a/pandas/_libs/window/meson.build b/pandas/_libs/window/meson.build
index 1d49bba47e139..8c00a98b1241a 100644
--- a/pandas/_libs/window/meson.build
+++ b/pandas/_libs/window/meson.build
@@ -1,7 +1,14 @@
+cy_args = ['-X always_allow_keywords=true']
+# Use shared utility code to reduce wheel sizes
+# copied from https://github.com/scikit-learn/scikit-learn/pull/31151/files
+if cy.version().version_compare('>=3.1.0')
+  cy_args += ['--shared=pandas._libs._cyutility']
+endif
+
py.extension_module(
'aggregations',
['aggregations.pyx'],
- cython_args: ['-X always_allow_keywords=true'],
+ cython_args: cy_args,
include_directories: [inc_np, inc_pd],
subdir: 'pandas/_libs/window',
override_options: ['cython_language=cpp'],
@@ -11,7 +18,7 @@ py.extension_module(
py.extension_module(
'indexers',
['indexers.pyx'],
- cython_args: ['-X always_allow_keywords=true'],
+ cython_args: cy_args,
include_directories: [inc_np, inc_pd],
subdir: 'pandas/_libs/window',
install: true,
diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py
index 99826de51e1bf..da147c117ad43 100644
--- a/pandas/_testing/contexts.py
+++ b/pandas/_testing/contexts.py
@@ -3,6 +3,7 @@
from contextlib import contextmanager
import os
from pathlib import Path
+import sys
import tempfile
from typing import (
IO,
@@ -81,7 +82,9 @@ def setTZ(tz) -> None:
pass
else:
os.environ["TZ"] = tz
- time.tzset()
+ # Next line allows typing checks to pass on Windows
+ if sys.platform != "win32":
+ time.tzset()
orig_tz = os.environ.get("TZ")
setTZ(tz)
diff --git a/pandas/_typing.py b/pandas/_typing.py
index 4365ee85f72e3..889252bb00438 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -1,5 +1,6 @@
from __future__ import annotations
+from builtins import type as type_t # pyright: ignore[reportUnusedImport]
from collections.abc import (
Callable,
Hashable,
@@ -20,22 +21,23 @@
TYPE_CHECKING,
Any,
Literal,
- Optional,
Protocol,
- Type as type_t,
+ TypeAlias,
TypeVar,
Union,
overload,
)
import numpy as np
+import numpy.typing as npt
# To prevent import cycles place any internal imports in the branch below
# and use a string literal forward reference to it in subsequent types
# https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles
-if TYPE_CHECKING:
- import numpy.typing as npt
+# Note that Union is still needed when an alias includes a pandas type written
+# as a quoted forward reference, since the "|" syntax does not work with strings
+
+if TYPE_CHECKING:
from pandas._libs import (
NaTType,
Period,
@@ -76,19 +78,12 @@
from pandas.io.formats.format import EngFormatter
from pandas.tseries.holiday import AbstractHolidayCalendar
- ScalarLike_co = Union[
- int,
- float,
- complex,
- str,
- bytes,
- np.generic,
- ]
+ ScalarLike_co: TypeAlias = int | float | complex | str | bytes | np.generic
# numpy compatible types
- NumpyValueArrayLike = Union[ScalarLike_co, npt.ArrayLike]
+ NumpyValueArrayLike: TypeAlias = ScalarLike_co | npt.ArrayLike
# Name "npt._ArrayLikeInt_co" is not defined [name-defined]
- NumpySorter = Optional[npt._ArrayLikeInt_co] # type: ignore[name-defined]
+ NumpySorter: TypeAlias = npt._ArrayLikeInt_co | None # type: ignore[name-defined]
from typing import (
ParamSpec,
@@ -107,7 +102,6 @@
from typing_extensions import Unpack # pyright: ignore[reportUnusedImport]
else:
- npt: Any = None
ParamSpec: Any = None
Self: Any = None
TypeGuard: Any = None
@@ -120,10 +114,10 @@
# array-like
-ArrayLike = Union["ExtensionArray", np.ndarray]
+ArrayLike: TypeAlias = Union["ExtensionArray", np.ndarray]
ArrayLikeT = TypeVar("ArrayLikeT", "ExtensionArray", np.ndarray)
-AnyArrayLike = Union[ArrayLike, "Index", "Series"]
-TimeArrayLike = Union["DatetimeArray", "TimedeltaArray"]
+AnyArrayLike: TypeAlias = Union[ArrayLike, "Index", "Series"]
+TimeArrayLike: TypeAlias = Union["DatetimeArray", "TimedeltaArray"]
# list-like
@@ -152,31 +146,31 @@ def count(self, value: Any, /) -> int: ...
def __reversed__(self) -> Iterator[_T_co]: ...
-ListLike = Union[AnyArrayLike, SequenceNotStr, range]
+ListLike: TypeAlias = AnyArrayLike | SequenceNotStr | range
# scalars
-PythonScalar = Union[str, float, bool]
-DatetimeLikeScalar = Union["Period", "Timestamp", "Timedelta"]
-PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"]
-Scalar = Union[PythonScalar, PandasScalar, np.datetime64, np.timedelta64, date]
-IntStrT = TypeVar("IntStrT", bound=Union[int, str])
-
+PythonScalar: TypeAlias = str | float | bool
+DatetimeLikeScalar: TypeAlias = Union["Period", "Timestamp", "Timedelta"]
+PandasScalar: TypeAlias = Union["Period", "Timestamp", "Timedelta", "Interval"]
+Scalar: TypeAlias = PythonScalar | PandasScalar | np.datetime64 | np.timedelta64 | date
+IntStrT = TypeVar("IntStrT", bound=int | str)
# timestamp and timedelta convertible types
-TimestampConvertibleTypes = Union[
+TimestampConvertibleTypes: TypeAlias = Union[
"Timestamp", date, np.datetime64, np.int64, float, str
]
-TimestampNonexistent = Union[
- Literal["shift_forward", "shift_backward", "NaT", "raise"], timedelta
-]
-TimedeltaConvertibleTypes = Union[
+TimestampNonexistent: TypeAlias = (
+ Literal["shift_forward", "shift_backward", "NaT", "raise"] | timedelta
+)
+
+TimedeltaConvertibleTypes: TypeAlias = Union[
"Timedelta", timedelta, np.timedelta64, np.int64, float, str
]
-Timezone = Union[str, tzinfo]
+Timezone: TypeAlias = str | tzinfo
-ToTimestampHow = Literal["s", "e", "start", "end"]
+ToTimestampHow: TypeAlias = Literal["s", "e", "start", "end"]
# NDFrameT is stricter and ensures that the same subclass of NDFrame always is
# used. E.g. `def func(a: NDFrameT) -> NDFrameT: ...` means that if a
@@ -188,69 +182,66 @@ def __reversed__(self) -> Iterator[_T_co]: ...
FreqIndexT = TypeVar("FreqIndexT", "DatetimeIndex", "PeriodIndex", "TimedeltaIndex")
NumpyIndexT = TypeVar("NumpyIndexT", np.ndarray, "Index")
-AxisInt = int
-Axis = Union[AxisInt, Literal["index", "columns", "rows"]]
-IndexLabel = Union[Hashable, Sequence[Hashable]]
-Level = Hashable
-Shape = tuple[int, ...]
-Suffixes = Sequence[Optional[str]]
-Ordered = Optional[bool]
-JSONSerializable = Optional[Union[PythonScalar, list, dict]]
-Frequency = Union[str, "BaseOffset"]
-Axes = ListLike
-
-RandomState = Union[
- int,
- np.ndarray,
- np.random.Generator,
- np.random.BitGenerator,
- np.random.RandomState,
-]
+AxisInt: TypeAlias = int
+Axis: TypeAlias = AxisInt | Literal["index", "columns", "rows"]
+IndexLabel: TypeAlias = Hashable | Sequence[Hashable]
+Level: TypeAlias = Hashable
+Shape: TypeAlias = tuple[int, ...]
+Suffixes: TypeAlias = Sequence[str | None]
+Ordered: TypeAlias = bool | None
+JSONSerializable: TypeAlias = PythonScalar | list | dict | None
+Frequency: TypeAlias = Union[str, "BaseOffset"]
+Axes: TypeAlias = ListLike
+
+RandomState: TypeAlias = (
+ int
+ | np.ndarray
+ | np.random.Generator
+ | np.random.BitGenerator
+ | np.random.RandomState
+)
+
# dtypes
-NpDtype = Union[str, np.dtype, type_t[Union[str, complex, bool, object]]]
-Dtype = Union["ExtensionDtype", NpDtype]
-AstypeArg = Union["ExtensionDtype", "npt.DTypeLike"]
+NpDtype: TypeAlias = str | np.dtype | type[str | complex | bool | object]
+Dtype: TypeAlias = Union["ExtensionDtype", NpDtype]
+AstypeArg: TypeAlias = Union["ExtensionDtype", npt.DTypeLike]
 # DtypeArg specifies all allowable dtypes in a function's dtype argument
-DtypeArg = Union[Dtype, Mapping[Hashable, Dtype]]
-DtypeObj = Union[np.dtype, "ExtensionDtype"]
+DtypeArg: TypeAlias = Dtype | Mapping[Hashable, Dtype]
+DtypeObj: TypeAlias = Union[np.dtype, "ExtensionDtype"]
# converters
-ConvertersArg = dict[Hashable, Callable[[Dtype], Dtype]]
+ConvertersArg: TypeAlias = dict[Hashable, Callable[[Dtype], Dtype]]
# parse_dates
-ParseDatesArg = Union[
- bool, list[Hashable], list[list[Hashable]], dict[Hashable, list[Hashable]]
-]
+ParseDatesArg: TypeAlias = (
+ bool | list[Hashable] | list[list[Hashable]] | dict[Hashable, list[Hashable]]
+)
# For functions like rename that convert one label to another
-Renamer = Union[Mapping[Any, Hashable], Callable[[Any], Hashable]]
+Renamer: TypeAlias = Mapping[Any, Hashable] | Callable[[Any], Hashable]
# to maintain type information across generic functions and parametrization
T = TypeVar("T")
# used in decorators to preserve the signature of the function it decorates
# see https://mypy.readthedocs.io/en/stable/generics.html#declaring-decorators
-FuncType = Callable[..., Any]
+FuncType: TypeAlias = Callable[..., Any]
F = TypeVar("F", bound=FuncType)
TypeT = TypeVar("TypeT", bound=type)
# types of vectorized key functions for DataFrame::sort_values and
# DataFrame::sort_index, among others
-ValueKeyFunc = Optional[Callable[["Series"], Union["Series", AnyArrayLike]]]
-IndexKeyFunc = Optional[Callable[["Index"], Union["Index", AnyArrayLike]]]
+ValueKeyFunc: TypeAlias = Callable[["Series"], Union["Series", AnyArrayLike]] | None
+IndexKeyFunc: TypeAlias = Callable[["Index"], Union["Index", AnyArrayLike]] | None
# types of `func` kwarg for DataFrame.aggregate and Series.aggregate
-AggFuncTypeBase = Union[Callable, str]
-AggFuncTypeDict = MutableMapping[
- Hashable, Union[AggFuncTypeBase, list[AggFuncTypeBase]]
+AggFuncTypeBase: TypeAlias = Callable | str
+AggFuncTypeDict: TypeAlias = MutableMapping[
+ Hashable, AggFuncTypeBase | list[AggFuncTypeBase]
]
-AggFuncType = Union[
- AggFuncTypeBase,
- list[AggFuncTypeBase],
- AggFuncTypeDict,
-]
-AggObjType = Union[
+AggFuncType: TypeAlias = AggFuncTypeBase | list[AggFuncTypeBase] | AggFuncTypeDict
+AggObjType: TypeAlias = Union[
"Series",
"DataFrame",
"GroupBy",
@@ -260,7 +251,7 @@ def __reversed__(self) -> Iterator[_T_co]: ...
"Resampler",
]
-PythonFuncType = Callable[[Any], Any]
+PythonFuncType: TypeAlias = Callable[[Any], Any]
# filenames and file-like-objects
AnyStr_co = TypeVar("AnyStr_co", str, bytes, covariant=True)
@@ -330,31 +321,30 @@ def closed(self) -> bool:
...
-FilePath = Union[str, "PathLike[str]"]
+FilePath: TypeAlias = str | PathLike[str]
# for arbitrary kwargs passed during reading/writing files
-StorageOptions = Optional[dict[str, Any]]
-
+StorageOptions: TypeAlias = dict[str, Any] | None
# compression keywords and compression
-CompressionDict = dict[str, Any]
-CompressionOptions = Optional[
- Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd", "tar"], CompressionDict]
-]
+CompressionDict: TypeAlias = dict[str, Any]
+CompressionOptions: TypeAlias = (
+ Literal["infer", "gzip", "bz2", "zip", "xz", "zstd", "tar"] | CompressionDict | None
+)
# types in DataFrameFormatter
-FormattersType = Union[
- list[Callable], tuple[Callable, ...], Mapping[Union[str, int], Callable]
-]
-ColspaceType = Mapping[Hashable, Union[str, int]]
-FloatFormatType = Union[str, Callable, "EngFormatter"]
-ColspaceArgType = Union[
- str, int, Sequence[Union[str, int]], Mapping[Hashable, Union[str, int]]
-]
+FormattersType: TypeAlias = (
+ list[Callable] | tuple[Callable, ...] | Mapping[str | int, Callable]
+)
+ColspaceType: TypeAlias = Mapping[Hashable, str | int]
+FloatFormatType: TypeAlias = Union[str, Callable, "EngFormatter"]
+ColspaceArgType: TypeAlias = (
+ str | int | Sequence[str | int] | Mapping[Hashable, str | int]
+)
# Arguments for fillna()
-FillnaOptions = Literal["backfill", "bfill", "ffill", "pad"]
-InterpolateOptions = Literal[
+FillnaOptions: TypeAlias = Literal["backfill", "bfill", "ffill", "pad"]
+InterpolateOptions: TypeAlias = Literal[
"linear",
"time",
"index",
@@ -376,7 +366,7 @@ def closed(self) -> bool:
]
# internals
-Manager = Union["BlockManager", "SingleBlockManager"]
+Manager: TypeAlias = Union["BlockManager", "SingleBlockManager"]
# indexing
# PositionalIndexer -> valid 1D positional indexer, e.g. can pass
@@ -389,63 +379,62 @@ def closed(self) -> bool:
# https://github.com/python/typing/issues/684#issuecomment-548203158
# https://bugs.python.org/issue41810
# Using List[int] here rather than Sequence[int] to disallow tuples.
-ScalarIndexer = Union[int, np.integer]
-SequenceIndexer = Union[slice, list[int], np.ndarray]
-PositionalIndexer = Union[ScalarIndexer, SequenceIndexer]
-PositionalIndexerTuple = tuple[PositionalIndexer, PositionalIndexer]
-PositionalIndexer2D = Union[PositionalIndexer, PositionalIndexerTuple]
-if TYPE_CHECKING:
- TakeIndexer = Union[Sequence[int], Sequence[np.integer], npt.NDArray[np.integer]]
-else:
- TakeIndexer = Any
+ScalarIndexer: TypeAlias = int | np.integer
+SequenceIndexer: TypeAlias = slice | list[int] | np.ndarray
+PositionalIndexer: TypeAlias = ScalarIndexer | SequenceIndexer
+PositionalIndexerTuple: TypeAlias = tuple[PositionalIndexer, PositionalIndexer]
+PositionalIndexer2D: TypeAlias = PositionalIndexer | PositionalIndexerTuple
+TakeIndexer: TypeAlias = Sequence[int] | Sequence[np.integer] | npt.NDArray[np.integer]
# Shared by functions such as drop and astype
-IgnoreRaise = Literal["ignore", "raise"]
+IgnoreRaise: TypeAlias = Literal["ignore", "raise"]
# Windowing rank methods
-WindowingRankType = Literal["average", "min", "max"]
+WindowingRankType: TypeAlias = Literal["average", "min", "max"]
# read_csv engines
-CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]
+CSVEngine: TypeAlias = Literal["c", "python", "pyarrow", "python-fwf"]
# read_json engines
-JSONEngine = Literal["ujson", "pyarrow"]
+JSONEngine: TypeAlias = Literal["ujson", "pyarrow"]
# read_xml parsers
-XMLParsers = Literal["lxml", "etree"]
+XMLParsers: TypeAlias = Literal["lxml", "etree"]
# read_html flavors
-HTMLFlavors = Literal["lxml", "html5lib", "bs4"]
+HTMLFlavors: TypeAlias = Literal["lxml", "html5lib", "bs4"]
# Interval closed type
-IntervalLeftRight = Literal["left", "right"]
-IntervalClosedType = Union[IntervalLeftRight, Literal["both", "neither"]]
+IntervalLeftRight: TypeAlias = Literal["left", "right"]
+IntervalClosedType: TypeAlias = IntervalLeftRight | Literal["both", "neither"]
# datetime and NaTType
-DatetimeNaTType = Union[datetime, "NaTType"]
-DateTimeErrorChoices = Literal["raise", "coerce"]
+DatetimeNaTType: TypeAlias = Union[datetime, "NaTType"]
+DateTimeErrorChoices: TypeAlias = Literal["raise", "coerce"]
# sort_index
-SortKind = Literal["quicksort", "mergesort", "heapsort", "stable"]
-NaPosition = Literal["first", "last"]
+SortKind: TypeAlias = Literal["quicksort", "mergesort", "heapsort", "stable"]
+NaPosition: TypeAlias = Literal["first", "last"]
# Arguments for nsmallest and nlargest
-NsmallestNlargestKeep = Literal["first", "last", "all"]
+NsmallestNlargestKeep: TypeAlias = Literal["first", "last", "all"]
# quantile interpolation
-QuantileInterpolation = Literal["linear", "lower", "higher", "midpoint", "nearest"]
+QuantileInterpolation: TypeAlias = Literal[
+ "linear", "lower", "higher", "midpoint", "nearest"
+]
# plotting
-PlottingOrientation = Literal["horizontal", "vertical"]
+PlottingOrientation: TypeAlias = Literal["horizontal", "vertical"]
# dropna
-AnyAll = Literal["any", "all"]
+AnyAll: TypeAlias = Literal["any", "all"]
# merge
-MergeHow = Literal[
+MergeHow: TypeAlias = Literal[
"left", "right", "inner", "outer", "cross", "left_anti", "right_anti"
]
-MergeValidate = Literal[
+MergeValidate: TypeAlias = Literal[
"one_to_one",
"1:1",
"one_to_many",
@@ -457,8 +446,8 @@ def closed(self) -> bool:
]
# join
-JoinHow = Literal["left", "right", "inner", "outer"]
-JoinValidate = Literal[
+JoinHow: TypeAlias = Literal["left", "right", "inner", "outer"]
+JoinValidate: TypeAlias = Literal[
"one_to_one",
"1:1",
"one_to_many",
@@ -470,25 +459,28 @@ def closed(self) -> bool:
]
# reindex
-ReindexMethod = Union[FillnaOptions, Literal["nearest"]]
+ReindexMethod: TypeAlias = FillnaOptions | Literal["nearest"]
-MatplotlibColor = Union[str, Sequence[float]]
-TimeGrouperOrigin = Union[
+MatplotlibColor: TypeAlias = str | Sequence[float]
+TimeGrouperOrigin: TypeAlias = Union[
"Timestamp", Literal["epoch", "start", "start_day", "end", "end_day"]
]
-TimeAmbiguous = Union[Literal["infer", "NaT", "raise"], "npt.NDArray[np.bool_]"]
-TimeNonexistent = Union[
- Literal["shift_forward", "shift_backward", "NaT", "raise"], timedelta
-]
-DropKeep = Literal["first", "last", False]
-CorrelationMethod = Union[
- Literal["pearson", "kendall", "spearman"], Callable[[np.ndarray, np.ndarray], float]
-]
-AlignJoin = Literal["outer", "inner", "left", "right"]
-DtypeBackend = Literal["pyarrow", "numpy_nullable"]
+TimeAmbiguous: TypeAlias = Literal["infer", "NaT", "raise"] | npt.NDArray[np.bool_]
+TimeNonexistent: TypeAlias = (
+ Literal["shift_forward", "shift_backward", "NaT", "raise"] | timedelta
+)
+
+DropKeep: TypeAlias = Literal["first", "last", False]
+CorrelationMethod: TypeAlias = (
+ Literal["pearson", "kendall", "spearman"]
+ | Callable[[np.ndarray, np.ndarray], float]
+)
-TimeUnit = Literal["s", "ms", "us", "ns"]
-OpenFileErrors = Literal[
+AlignJoin: TypeAlias = Literal["outer", "inner", "left", "right"]
+DtypeBackend: TypeAlias = Literal["pyarrow", "numpy_nullable"]
+
+TimeUnit: TypeAlias = Literal["s", "ms", "us", "ns"]
+OpenFileErrors: TypeAlias = Literal[
"strict",
"ignore",
"replace",
@@ -499,34 +491,32 @@ def closed(self) -> bool:
]
# update
-UpdateJoin = Literal["left"]
+UpdateJoin: TypeAlias = Literal["left"]
# applymap
-NaAction = Literal["ignore"]
+NaAction: TypeAlias = Literal["ignore"]
# from_dict
-FromDictOrient = Literal["columns", "index", "tight"]
+FromDictOrient: TypeAlias = Literal["columns", "index", "tight"]
# to_stata
-ToStataByteorder = Literal[">", "<", "little", "big"]
+ToStataByteorder: TypeAlias = Literal[">", "<", "little", "big"]
# ExcelWriter
-ExcelWriterIfSheetExists = Literal["error", "new", "replace", "overlay"]
-ExcelWriterMergeCells = Union[bool, Literal["columns"]]
+ExcelWriterIfSheetExists: TypeAlias = Literal["error", "new", "replace", "overlay"]
+ExcelWriterMergeCells: TypeAlias = bool | Literal["columns"]
# Offsets
-OffsetCalendar = Union[np.busdaycalendar, "AbstractHolidayCalendar"]
+OffsetCalendar: TypeAlias = Union[np.busdaycalendar, "AbstractHolidayCalendar"]
# read_csv: usecols
-UsecolsArgType = Union[
- SequenceNotStr[Hashable],
- range,
- AnyArrayLike,
- Callable[[HashableT], bool],
- None,
-]
+UsecolsArgType: TypeAlias = (
+ SequenceNotStr[Hashable] | range | AnyArrayLike | Callable[[HashableT], bool] | None
+)
# maintain the sub-type of any hashable sequence
SequenceT = TypeVar("SequenceT", bound=Sequence[Hashable])
-SliceType = Optional[Hashable]
+SliceType: TypeAlias = Hashable | None
+
+__all__ = ["type_t"]
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 6b90389a62056..f01dfab0de829 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -22,38 +22,38 @@
VERSIONS = {
"adbc-driver-postgresql": "0.10.0",
"adbc-driver-sqlite": "0.8.0",
- "bs4": "4.11.2",
- "blosc": "1.21.3",
+ "bs4": "4.12.3",
"bottleneck": "1.3.6",
- "fastparquet": "2023.10.0",
- "fsspec": "2022.11.0",
+ "fastparquet": "2024.2.0",
+ "fsspec": "2023.12.2",
"html5lib": "1.1",
"hypothesis": "6.84.0",
- "gcsfs": "2022.11.0",
- "jinja2": "3.1.2",
+ "gcsfs": "2023.12.2",
+ "jinja2": "3.1.3",
"lxml.etree": "4.9.2",
- "matplotlib": "3.6.3",
- "numba": "0.56.4",
- "numexpr": "2.8.4",
+ "matplotlib": "3.8.3",
+ "numba": "0.59.0",
+ "numexpr": "2.9.0",
"odfpy": "1.4.1",
- "openpyxl": "3.1.0",
+ "openpyxl": "3.1.2",
"psycopg2": "2.9.6", # (dt dec pq3 ext lo64)
- "pymysql": "1.0.2",
+ "pymysql": "1.1.0",
"pyarrow": "10.0.1",
- "pyreadstat": "1.2.0",
+ "pyiceberg": "0.7.1",
+ "pyreadstat": "1.2.6",
"pytest": "7.3.2",
"python-calamine": "0.1.7",
"pytz": "2023.4",
"pyxlsb": "1.0.10",
- "s3fs": "2022.11.0",
- "scipy": "1.10.0",
+ "s3fs": "2023.12.2",
+ "scipy": "1.12.0",
"sqlalchemy": "2.0.0",
"tables": "3.8.0",
"tabulate": "0.9.0",
- "xarray": "2022.12.0",
+ "xarray": "2024.1.1",
"xlrd": "2.0.1",
- "xlsxwriter": "3.0.5",
- "zstandard": "0.19.0",
+ "xlsxwriter": "3.2.0",
+ "zstandard": "0.22.0",
"tzdata": "2022.7",
"qtpy": "2.3.0",
"pyqt5": "5.15.9",
diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py
index 3306b36d71806..e95b44c879940 100644
--- a/pandas/compat/numpy/__init__.py
+++ b/pandas/compat/numpy/__init__.py
@@ -36,8 +36,8 @@
r".*In the future `np\.long` will be defined as.*",
FutureWarning,
)
- np_long = np.long # type: ignore[attr-defined]
- np_ulong = np.ulong # type: ignore[attr-defined]
+ np_long = np.long
+ np_ulong = np.ulong
except AttributeError:
np_long = np.int_
np_ulong = np.uint
diff --git a/pandas/core/_numba/kernels/min_max_.py b/pandas/core/_numba/kernels/min_max_.py
index 68aa1446bbe3c..c03f20c871012 100644
--- a/pandas/core/_numba/kernels/min_max_.py
+++ b/pandas/core/_numba/kernels/min_max_.py
@@ -9,7 +9,10 @@
from __future__ import annotations
-from typing import TYPE_CHECKING
+from typing import (
+ TYPE_CHECKING,
+ Any,
+)
import numba
import numpy as np
@@ -18,6 +21,20 @@
from pandas._typing import npt
+@numba.njit(nogil=True, parallel=False)
+def bisect_left(a: list[Any], x: Any, lo: int = 0, hi: int = -1) -> int:
+ """Same as https://docs.python.org/3/library/bisect.html; not in numba yet!"""
+ if hi == -1:
+ hi = len(a)
+ while lo < hi:
+ mid = (lo + hi) // 2
+ if a[mid] < x:
+ lo = mid + 1
+ else:
+ hi = mid
+ return lo
+
+
@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_min_max(
values: np.ndarray,
@@ -27,55 +44,87 @@ def sliding_min_max(
min_periods: int,
is_max: bool,
) -> tuple[np.ndarray, list[int]]:
+ # Basic idea of the algorithm: https://stackoverflow.com/a/12239580
+    # It was generalized to work with an arbitrary list of windows of any size
+    # and position by adding the Dominators stack.
+
N = len(start)
- nobs = 0
- output = np.empty(N, dtype=result_dtype)
na_pos = []
- # Use deque once numba supports it
- # https://github.com/numba/numba/issues/7417
- Q: list = []
- W: list = []
- for i in range(N):
- curr_win_size = end[i] - start[i]
- if i == 0:
- st = start[i]
- else:
- st = end[i - 1]
-
- for k in range(st, end[i]):
- ai = values[k]
- if not np.isnan(ai):
- nobs += 1
- elif is_max:
- ai = -np.inf
- else:
- ai = np.inf
- # Discard previous entries if we find new min or max
- if is_max:
- while Q and ((ai >= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]):
- Q.pop()
- else:
- while Q and ((ai <= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]):
- Q.pop()
- Q.append(k)
- W.append(k)
-
- # Discard entries outside and left of current window
- while Q and Q[0] <= start[i] - 1:
- Q.pop(0)
- while W and W[0] <= start[i] - 1:
- if not np.isnan(values[W[0]]):
- nobs -= 1
- W.pop(0)
-
- # Save output based on index in input value array
- if Q and curr_win_size > 0 and nobs >= min_periods:
- output[i] = values[Q[0]]
+ output = np.empty(N, dtype=result_dtype)
+
+ def cmp(a: Any, b: Any, is_max: bool) -> bool:
+ if is_max:
+ return a >= b
else:
+ return a <= b
+
+ # Indices of bounded extrema in `values`. `candidates[i]` is always increasing.
+ # `values[candidates[i]]` is decreasing for max and increasing for min.
+ candidates: list[int] = [] # this is a queue
+ # Indices of largest windows that "cover" preceding windows.
+ dominators: list[int] = [] # this is a stack
+
+ if min_periods < 1:
+ min_periods = 1
+
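+    # Scan right to left to build the dominators stack: indices of later
+    # windows that start further left than the window immediately before them
+    # (and further left than anything already on the stack).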
+ if N > 2:
+ i_next = N - 1 # equivalent to i_next = i+1 inside the loop
+ for i in range(N - 2, -1, -1):
+ next_dominates = start[i_next] < start[i]
+ if next_dominates and (
+ not dominators or start[dominators[-1]] > start[i_next]
+ ):
+ dominators.append(i_next)
+ i_next = i
+
+ # NaN tracking to guarantee min_periods
+ valid_start = -min_periods
+
+ last_end = 0
+ last_start = -1
+
+ for i in range(N):
+ this_start = start[i].item()
+ this_end = end[i].item()
+
+ if dominators and dominators[-1] == i:
+ dominators.pop()
+
+ if not (
+ this_end > last_end or (this_end == last_end and this_start >= last_start)
+ ):
+ raise ValueError(
+ "Start/End ordering requirement is violated at index " + str(i)
+ )
+
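+        # Candidates to the left of `stash_start` can no longer be the extremum
+        # of this window nor of any pending wider window on the dominators
+        # stack, so they can be dropped from the front of the queue.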
+ stash_start = (
+ this_start if not dominators else min(this_start, start[dominators[-1]])
+ )
+ while candidates and candidates[0] < stash_start:
+ candidates.pop(0)
+
+ for k in range(last_end, this_end):
+ if not np.isnan(values[k]):
+ valid_start += 1
+ while valid_start >= 0 and np.isnan(values[valid_start]):
+ valid_start += 1
+ while candidates and cmp(values[k], values[candidates[-1]], is_max):
+ candidates.pop() # Q.pop_back()
+ candidates.append(k) # Q.push_back(k)
+
+ if not candidates or (this_start > valid_start):
if values.dtype.kind != "i":
output[i] = np.nan
else:
na_pos.append(i)
+ elif candidates[0] >= this_start:
+ # ^^ This is here to avoid costly bisection for fixed window sizes.
+ output[i] = values[candidates[0]]
+ else:
+ q_idx = bisect_left(candidates, this_start, lo=1)
+ output[i] = values[candidates[q_idx]]
+ last_end = this_end
+ last_start = this_start
return output, na_pos
diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py
index 78684eacf2d66..0331c26c805b6 100644
--- a/pandas/core/accessor.py
+++ b/pandas/core/accessor.py
@@ -351,7 +351,7 @@ def register_dataframe_accessor(name: str) -> Callable[[TypeT], TypeT]:
AttributeError: The series must contain integer data only.
>>> df = pd.Series([1, 2, 3])
>>> df.int_accessor.sum()
-6"""
+np.int64(6)"""
@doc(_register_accessor, klass="Series", examples=_register_series_examples)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 76f2fdad591ff..7fc391d3ffb51 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -47,6 +47,7 @@
is_bool_dtype,
is_complex_dtype,
is_dict_like,
+ is_dtype_equal,
is_extension_array_dtype,
is_float,
is_float_dtype,
@@ -215,7 +216,7 @@ def _reconstruct_data(
values = cls._from_sequence(values, dtype=dtype) # type: ignore[assignment]
else:
- values = values.astype(dtype, copy=False)
+ values = values.astype(dtype, copy=False) # type: ignore[assignment]
return values
@@ -511,6 +512,7 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
len(values) > 0
and values.dtype.kind in "iufcb"
and not is_signed_integer_dtype(comps)
+ and not is_dtype_equal(values, comps)
):
# GH#46485 Use object to avoid upcast to float64 later
# TODO: Share with _find_common_type_compat
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index da6124307e3f1..2c96f1ef020ac 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -327,7 +327,7 @@ def transform(self) -> DataFrame | Series:
if is_series:
func = {com.get_callable_name(v) or v: v for v in func}
else:
- func = {col: func for col in obj}
+ func = dict.fromkeys(obj, func)
if is_dict_like(func):
func = cast(AggFuncTypeDict, func)
diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py
index 8a920d1849bb3..eb5026454552c 100644
--- a/pandas/core/array_algos/quantile.py
+++ b/pandas/core/array_algos/quantile.py
@@ -102,7 +102,7 @@ def quantile_with_mask(
interpolation=interpolation,
)
- result = np.asarray(result)
+ result = np.asarray(result) # type: ignore[assignment]
result = result.T
return result
@@ -196,7 +196,7 @@ def _nanquantile(
# Caller is responsible for ensuring mask shape match
assert mask.shape == values.shape
result = [
- _nanquantile_1d(val, m, qs, na_value, interpolation=interpolation)
+ _nanquantile_1d(val, m, qs, na_value, interpolation=interpolation) # type: ignore[arg-type]
for (val, m) in zip(list(values), list(mask))
]
if values.dtype.kind == "f":
diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
index 4e6f20e6ad3dd..26585e7bab8e3 100644
--- a/pandas/core/arrays/_mixins.py
+++ b/pandas/core/arrays/_mixins.py
@@ -142,18 +142,12 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike:
dt64_values = arr.view(dtype)
return DatetimeArray._simple_new(dt64_values, dtype=dtype)
-
elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype):
from pandas.core.arrays import TimedeltaArray
td64_values = arr.view(dtype)
return TimedeltaArray._simple_new(td64_values, dtype=dtype)
-
- # error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible
- # type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None,
- # type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int,
- # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
- return arr.view(dtype=dtype) # type: ignore[arg-type]
+ return arr.view(dtype=dtype)
def take(
self,
diff --git a/pandas/core/arrays/arrow/_arrow_utils.py b/pandas/core/arrays/arrow/_arrow_utils.py
index 285c3fd465ffc..7da83e2257e30 100644
--- a/pandas/core/arrays/arrow/_arrow_utils.py
+++ b/pandas/core/arrays/arrow/_arrow_utils.py
@@ -44,7 +44,7 @@ def pyarrow_array_to_numpy_and_mask(
mask = pyarrow.BooleanArray.from_buffers(
pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset
)
- mask = np.asarray(mask)
+ mask = np.asarray(mask) # type: ignore[assignment]
else:
mask = np.ones(len(arr), dtype=bool)
return data, mask
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 9295cf7873d98..0b90bcea35100 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -33,7 +33,6 @@
infer_dtype_from_scalar,
)
from pandas.core.dtypes.common import (
- CategoricalDtype,
is_array_like,
is_bool_dtype,
is_float_dtype,
@@ -730,9 +729,7 @@ def __setstate__(self, state) -> None:
def _cmp_method(self, other, op) -> ArrowExtensionArray:
pc_func = ARROW_CMP_FUNCS[op.__name__]
- if isinstance(
- other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)
- ) or isinstance(getattr(other, "dtype", None), CategoricalDtype):
+ if isinstance(other, (ExtensionArray, np.ndarray, list)):
try:
result = pc_func(self._pa_array, self._box_pa(other))
except pa.ArrowNotImplementedError:
@@ -2540,7 +2537,7 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
dummies_dtype = np.bool_
dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype)
dummies[indices] = True
- dummies = dummies.reshape((n_rows, n_cols))
+ dummies = dummies.reshape((n_rows, n_cols)) # type: ignore[assignment]
result = type(self)(pa.array(list(dummies)))
return result, uniques_sorted.to_pylist()
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index dad38abccf4ee..d0048e122051a 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -596,7 +596,7 @@ def to_numpy(
if copy or na_value is not lib.no_default:
result = result.copy()
if na_value is not lib.no_default:
- result[self.isna()] = na_value
+ result[self.isna()] = na_value # type: ignore[index]
return result
# ------------------------------------------------------------------------
@@ -941,7 +941,7 @@ def argmin(self, skipna: bool = True) -> int:
--------
>>> arr = pd.array([3, 1, 2, 5, 4])
>>> arr.argmin()
- 1
+ np.int64(1)
"""
# Implementer note: You have two places to override the behavior of
# argmin.
@@ -975,7 +975,7 @@ def argmax(self, skipna: bool = True) -> int:
--------
>>> arr = pd.array([3, 1, 2, 5, 4])
>>> arr.argmax()
- 3
+ np.int64(3)
"""
# Implementer note: You have two places to override the behavior of
# argmax.
@@ -1959,10 +1959,10 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
--------
>>> class MyExtensionArray(pd.arrays.NumpyExtensionArray):
... def _formatter(self, boxed=False):
- ... return lambda x: "*" + str(x) + "*" if boxed else repr(x) + "*"
+ ... return lambda x: "*" + str(x) + "*"
>>> MyExtensionArray(np.array([1, 2, 3, 4]))
- [1*, 2*, 3*, 4*]
+ [*1*, *2*, *3*, *4*]
Length: 4, dtype: int64
"""
if boxed:
@@ -2176,15 +2176,15 @@ def _reduce(
Examples
--------
>>> pd.array([1, 2, 3])._reduce("min")
- 1
+ np.int64(1)
>>> pd.array([1, 2, 3])._reduce("max")
- 3
+ np.int64(3)
>>> pd.array([1, 2, 3])._reduce("sum")
- 6
+ np.int64(6)
>>> pd.array([1, 2, 3])._reduce("mean")
- 2.0
+ np.float64(2.0)
>>> pd.array([1, 2, 3])._reduce("median")
- 2.0
+ np.float64(2.0)
"""
meth = getattr(self, name, None)
if meth is None:
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 647530151d5f6..3d2ad109a55ba 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -452,7 +452,7 @@ def __init__(
if isinstance(values, Index):
arr = values._data._pa_array.combine_chunks()
else:
- arr = values._pa_array.combine_chunks()
+ arr = extract_array(values)._pa_array.combine_chunks()
categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype)
codes = arr.indices.to_numpy()
dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered)
@@ -1666,7 +1666,7 @@ def __array__(
Parameters
----------
dtype : np.dtype or None
- Specifies the the dtype for the array.
+ Specifies the dtype for the array.
copy : bool or None, optional
See :func:`numpy.asarray`.
@@ -1853,7 +1853,7 @@ def value_counts(self, dropna: bool = True) -> Series:
count = np.bincount(obs, minlength=ncat or 0)
else:
count = np.bincount(np.where(mask, code, ncat))
- ix = np.append(ix, -1)
+ ix = np.append(ix, -1) # type: ignore[assignment]
ix = coerce_indexer_dtype(ix, self.dtype.categories)
ix_categorical = self._from_backing_data(ix)
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index eba738c926497..994d7b1d0081c 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -275,7 +275,7 @@ def _unbox_scalar(
--------
>>> arr = pd.array(np.array(["1970-01-01"], "datetime64[ns]"))
>>> arr._unbox_scalar(arr[0])
- numpy.datetime64('1970-01-01T00:00:00.000000000')
+ np.datetime64('1970-01-01T00:00:00.000000000')
"""
raise AbstractMethodError(self)
@@ -2394,7 +2394,7 @@ def take(
)
indices = np.asarray(indices, dtype=np.intp)
- maybe_slice = lib.maybe_indices_to_slice(indices, len(self))
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) # type: ignore[arg-type]
if isinstance(maybe_slice, slice):
freq = self._get_getitem_freq(maybe_slice)
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index df40c9c11b117..b31c543188282 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -331,7 +331,7 @@ def _simple_new( # type: ignore[override]
else:
# DatetimeTZDtype. If we have e.g. DatetimeTZDtype[us, UTC],
# then values.dtype should be M8[us].
- assert dtype._creso == get_unit_from_dtype(values.dtype)
+ assert dtype._creso == get_unit_from_dtype(values.dtype) # type: ignore[union-attr]
result = super()._simple_new(values, dtype)
result._freq = freq
@@ -542,7 +542,7 @@ def _unbox_scalar(self, value) -> np.datetime64:
raise ValueError("'value' should be a Timestamp.")
self._check_compatible_with(value)
if value is NaT:
- return np.datetime64(value._value, self.unit)
+ return np.datetime64(value._value, self.unit) # type: ignore[call-overload]
else:
return value.as_unit(self.unit, round_ok=False).asm8
@@ -813,10 +813,7 @@ def _add_offset(self, offset: BaseOffset) -> Self:
try:
res_values = offset._apply_array(values._ndarray)
if res_values.dtype.kind == "i":
- # error: Argument 1 to "view" of "ndarray" has incompatible type
- # "dtype[datetime64] | DatetimeTZDtype"; expected
- # "dtype[Any] | type[Any] | _SupportsDType[dtype[Any]]"
- res_values = res_values.view(values.dtype) # type: ignore[arg-type]
+ res_values = res_values.view(values.dtype)
except NotImplementedError:
if get_option("performance_warnings"):
warnings.warn(
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
index 0bf2089df5f85..6cb79e915c78b 100644
--- a/pandas/core/arrays/interval.py
+++ b/pandas/core/arrays/interval.py
@@ -1775,7 +1775,8 @@ def to_tuples(self, na_tuple: bool = True) -> np.ndarray:
[(0, 1], (1, 2]]
Length: 2, dtype: interval[int64, right]
>>> idx.to_tuples()
- array([(0, 1), (1, 2)], dtype=object)
+ array([(np.int64(0), np.int64(1)), (np.int64(1), np.int64(2))],
+ dtype=object)
For :class:`pandas.IntervalIndex`:
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 708a3818bcbb7..e7a6b207363c3 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -1378,25 +1378,25 @@ def any(
skips NAs):
>>> pd.array([True, False, True]).any()
- True
+ np.True_
>>> pd.array([True, False, pd.NA]).any()
- True
+ np.True_
>>> pd.array([False, False, pd.NA]).any()
- False
+ np.False_
>>> pd.array([], dtype="boolean").any()
- False
+ np.False_
>>> pd.array([pd.NA], dtype="boolean").any()
- False
+ np.False_
>>> pd.array([pd.NA], dtype="Float64").any()
- False
+ np.False_
With ``skipna=False``, the result can be NA if this is logically
required (whether ``pd.NA`` is True or False influences the result):
>>> pd.array([True, False, pd.NA]).any(skipna=False)
- True
+ np.True_
>>> pd.array([1, 0, pd.NA]).any(skipna=False)
- True
+ np.True_
>>> pd.array([False, False, pd.NA]).any(skipna=False)
>>> pd.array([0, 0, pd.NA]).any(skipna=False)
@@ -1466,17 +1466,17 @@ def all(
skips NAs):
>>> pd.array([True, True, pd.NA]).all()
- True
+ np.True_
>>> pd.array([1, 1, pd.NA]).all()
- True
+ np.True_
>>> pd.array([True, False, pd.NA]).all()
- False
+ np.False_
>>> pd.array([], dtype="boolean").all()
- True
+ np.True_
>>> pd.array([pd.NA], dtype="boolean").all()
- True
+ np.True_
>>> pd.array([pd.NA], dtype="Float64").all()
- True
+ np.True_
With ``skipna=False``, the result can be NA if this is logically
required (whether ``pd.NA`` is True or False influences the result):
@@ -1486,9 +1486,9 @@ def all(
>>> pd.array([1, 1, pd.NA]).all(skipna=False)
>>> pd.array([True, False, pd.NA]).all(skipna=False)
- False
+ np.False_
>>> pd.array([1, 0, pd.NA]).all(skipna=False)
- False
+ np.False_
"""
nv.validate_all((), kwargs)
@@ -1497,10 +1497,10 @@ def all(
result = values.all(axis=axis)
if skipna:
- return result
+ return result # type: ignore[return-value]
else:
if not result or len(self) == 0 or not self._mask.any():
- return result
+ return result # type: ignore[return-value]
else:
return self.dtype.na_value
diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
index eab8527eef526..7dde03b30cd6a 100644
--- a/pandas/core/arrays/sparse/accessor.py
+++ b/pandas/core/arrays/sparse/accessor.py
@@ -297,7 +297,7 @@ class SparseFrameAccessor(BaseAccessor, PandasDelegate):
--------
>>> df = pd.DataFrame({"a": [1, 2, 0, 0], "b": [3, 0, 0, 4]}, dtype="Sparse[int]")
>>> df.sparse.density
- 0.5
+ np.float64(0.5)
"""
def _validate(self, data) -> None:
@@ -459,7 +459,7 @@ def density(self) -> float:
--------
>>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
>>> df.sparse.density
- 0.5
+ np.float64(0.5)
"""
tmp = np.mean([column.array.density for _, column in self._parent.items()])
return tmp
diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py
index cc9fd2d5fb8b0..d4ef3003583c3 100644
--- a/pandas/core/arrays/sparse/scipy_sparse.py
+++ b/pandas/core/arrays/sparse/scipy_sparse.py
@@ -79,7 +79,7 @@ def _levels_to_axis(
ax_coords = codes[valid_ilocs]
ax_labels = ax_labels.tolist()
- return ax_coords, ax_labels
+ return ax_coords, ax_labels # pyright: ignore[reportReturnType]
def _to_ijv(
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 7227ea77ca433..8048306df91a2 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -123,10 +123,10 @@ class StringDtype(StorageExtensionDtype):
Examples
--------
>>> pd.StringDtype()
- string[python]
+    <StringDtype(na_value=<NA>)>
>>> pd.StringDtype(storage="pyarrow")
- string[pyarrow]
+    <StringDtype(na_value=<NA>)>
"""
@property
@@ -198,11 +198,8 @@ def __init__(
self._na_value = na_value
def __repr__(self) -> str:
- if self._na_value is libmissing.NA:
- return f"{self.name}[{self.storage}]"
- else:
- # TODO add more informative repr
- return self.name
+ storage = "" if self.storage == "pyarrow" else "storage='python', "
+        return f"<StringDtype({storage}na_value={self._na_value})>"
def __eq__(self, other: object) -> bool:
# we need to override the base class __eq__ because na_value (NA or NaN)
@@ -1018,7 +1015,30 @@ def searchsorted(
return super().searchsorted(value=value, side=side, sorter=sorter)
def _cmp_method(self, other, op):
- from pandas.arrays import BooleanArray
+ from pandas.arrays import (
+ ArrowExtensionArray,
+ BooleanArray,
+ )
+
+ if (
+ isinstance(other, BaseStringArray)
+ and self.dtype.na_value is not libmissing.NA
+ and other.dtype.na_value is libmissing.NA
+ ):
+            # NA semantics take priority over NaN semantics
+ return NotImplemented
+
+ if isinstance(other, ArrowExtensionArray):
+ if isinstance(other, BaseStringArray):
+ # pyarrow storage has priority over python storage
+                # (unless self has NA semantics and other does not)
+ if not (
+ self.dtype.na_value is libmissing.NA
+ and other.dtype.na_value is not libmissing.NA
+ ):
+ return NotImplemented
+ else:
+ return NotImplemented
if isinstance(other, StringArray):
other = other._ndarray
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index d35083fd892a8..9668981df827b 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -281,7 +281,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
]
# short-circuit to return all False array.
- if not len(value_set):
+ if not value_set:
return np.zeros(len(self), dtype=bool)
result = pc.is_in(
@@ -473,6 +473,14 @@ def value_counts(self, dropna: bool = True) -> Series:
return result
def _cmp_method(self, other, op):
+ if (
+ isinstance(other, (BaseStringArray, ArrowExtensionArray))
+ and self.dtype.na_value is not libmissing.NA
+ and other.dtype.na_value is libmissing.NA
+ ):
+            # NA semantics take priority over NaN semantics
+ return NotImplemented
+
result = super()._cmp_method(other, op)
if self.dtype.na_value is np.nan:
if op == operator.ne:
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
index c5b3129c506c8..9012b9f36348a 100644
--- a/pandas/core/arrays/timedeltas.py
+++ b/pandas/core/arrays/timedeltas.py
@@ -325,7 +325,7 @@ def _unbox_scalar(self, value) -> np.timedelta64:
raise ValueError("'value' should be a Timedelta.")
self._check_compatible_with(value)
if value is NaT:
- return np.timedelta64(value._value, self.unit)
+ return np.timedelta64(value._value, self.unit) # type: ignore[call-overload]
else:
return value.as_unit(self.unit, round_ok=False).asm8
diff --git a/pandas/core/base.py b/pandas/core/base.py
index a64cd8633c1db..6cc28d4e46634 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -804,9 +804,9 @@ def argmax(
dtype: float64
>>> s.argmax()
- 2
+ np.int64(2)
>>> s.argmin()
- 0
+ np.int64(0)
The maximum cereal calories is the third element and
the minimum cereal calories is the first element,
@@ -1360,7 +1360,7 @@ def factorize(
dtype: int64
>>> ser.searchsorted(4)
- 3
+ np.int64(3)
>>> ser.searchsorted([0, 4])
array([0, 3])
@@ -1379,7 +1379,7 @@ def factorize(
dtype: datetime64[s]
>>> ser.searchsorted('3/14/2000')
- 3
+ np.int64(3)
>>> ser = pd.Categorical(
... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True
@@ -1389,7 +1389,7 @@ def factorize(
Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk']
>>> ser.searchsorted('bread')
- 1
+ np.int64(1)
>>> ser.searchsorted(['bread'], side='right')
array([3])
@@ -1480,9 +1480,9 @@ def _arith_method(self, other, op):
with np.errstate(all="ignore"):
result = ops.arithmetic_op(lvalues, rvalues, op)
- return self._construct_result(result, name=res_name)
+ return self._construct_result(result, name=res_name, other=other)
- def _construct_result(self, result, name):
+ def _construct_result(self, result, name, other):
"""
Construct an appropriately-wrapped result from the ArrayLike result
of an arithmetic-like operation.
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 100ad312bd839..75f8a56aac5db 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -246,7 +246,8 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi
with warnings.catch_warnings():
# Can remove warning filter once NumPy 1.24 is min version
if not np_version_gte1p24:
- warnings.simplefilter("ignore", np.VisibleDeprecationWarning)
+ # np.VisibleDeprecationWarning only in np.exceptions in 2.0
+ warnings.simplefilter("ignore", np.VisibleDeprecationWarning) # type: ignore[attr-defined]
result = np.asarray(values, dtype=dtype)
except ValueError:
# Using try/except since it's more performant than checking is_list_like
diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py
index 14a393b02409c..b53596fe28e70 100644
--- a/pandas/core/computation/expr.py
+++ b/pandas/core/computation/expr.py
@@ -644,7 +644,11 @@ def visit_Attribute(self, node, **kwargs):
ctx = node.ctx
if isinstance(ctx, ast.Load):
# resolve the value
- resolved = self.visit(value).value
+ visited_value = self.visit(value)
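+            # visit() may return either a resolved term (exposing ``.value``)
+            # or a callable that resolves against the local environment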
+ if hasattr(visited_value, "value"):
+ resolved = visited_value.value
+ else:
+ resolved = visited_value(self.env)
try:
v = getattr(resolved, attr)
name = self.env.add_tmp(v)
diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py
index 166c9d47294cd..77b7d9ad11a6c 100644
--- a/pandas/core/computation/pytables.py
+++ b/pandas/core/computation/pytables.py
@@ -239,7 +239,8 @@ def stringify(value):
if conv_val not in metadata:
result = -1
else:
- result = metadata.searchsorted(conv_val, side="left")
+ # Find the index of the first match of conv_val in metadata
+ result = np.flatnonzero(metadata == conv_val)[0]
return TermValue(result, result, "integer")
elif kind == "integer":
try:
diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
index d8a42d83b6c54..428fc24cd08ac 100644
--- a/pandas/core/dtypes/base.py
+++ b/pandas/core/dtypes/base.py
@@ -45,6 +45,11 @@ class ExtensionDtype:
"""
A custom data type, to be paired with an ExtensionArray.
+ This enables support for third-party and custom dtypes within the
+ pandas ecosystem. By implementing this interface and pairing it with a custom
+ `ExtensionArray`, users can create rich data types that integrate cleanly
+ with pandas operations, such as grouping, joining, or aggregation.
+
See Also
--------
extensions.register_extension_dtype: Register an ExtensionType
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index e92f2363b69f1..68d99937f728c 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -655,24 +655,38 @@ def is_dtype_equal(source, target) -> bool:
Parameters
----------
- source : The first dtype to compare
- target : The second dtype to compare
+ source : type or str
+ The first dtype to compare.
+ target : type or str
+ The second dtype to compare.
Returns
-------
boolean
Whether or not the two dtypes are equal.
+ See Also
+ --------
+ api.types.is_categorical_dtype : Check whether the provided array or dtype
+ is of the Categorical dtype.
+ api.types.is_string_dtype : Check whether the provided array or dtype
+ is of the string dtype.
+ api.types.is_object_dtype : Check whether an array-like or dtype is of the
+ object dtype.
+
Examples
--------
+ >>> from pandas.api.types import is_dtype_equal
>>> is_dtype_equal(int, float)
False
>>> is_dtype_equal("int", int)
True
>>> is_dtype_equal(object, "category")
False
+ >>> from pandas.core.dtypes.dtypes import CategoricalDtype
>>> is_dtype_equal(CategoricalDtype(), "category")
True
+ >>> from pandas.core.dtypes.dtypes import DatetimeTZDtype
>>> is_dtype_equal(DatetimeTZDtype(tz="UTC"), "datetime64")
False
"""
diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index f20ca44728664..71fe0f6e4feb0 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -428,9 +428,9 @@ def array_equivalent(
Examples
--------
>>> array_equivalent(np.array([1, 2, np.nan]), np.array([1, 2, np.nan]))
- True
+ np.True_
>>> array_equivalent(np.array([1, np.nan, 2]), np.array([1, 2, np.nan]))
- False
+ np.False_
"""
left, right = np.asarray(left), np.asarray(right)
@@ -626,7 +626,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True):
>>> na_value_for_dtype(np.dtype("bool"))
False
>>> na_value_for_dtype(np.dtype("datetime64[ns]"))
- numpy.datetime64('NaT')
+ np.datetime64('NaT')
"""
if isinstance(dtype, ExtensionDtype):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 8f65277f660f7..ea7c1afdd036b 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -8,13 +8,11 @@
alignment and a host of useful data manipulation methods having to do with the
labeling information
"""
-
from __future__ import annotations
import collections
from collections import abc
from collections.abc import (
- Callable,
Hashable,
Iterable,
Iterator,
@@ -22,6 +20,7 @@
Sequence,
)
import functools
+from inspect import signature
from io import StringIO
import itertools
import operator
@@ -30,6 +29,7 @@
from typing import (
TYPE_CHECKING,
Any,
+ Callable,
Literal,
cast,
overload,
@@ -39,7 +39,12 @@
import numpy as np
from numpy import ma
-from pandas._config import get_option
+from pandas._config import (
+ get_option,
+ using_copy_on_write,
+ warn_copy_on_write,
+)
+from pandas._config.config import _get_option
from pandas._libs import (
algos as libalgos,
@@ -55,17 +60,16 @@
from pandas.errors import (
ChainedAssignmentError,
InvalidIndexError,
-)
-from pandas.errors.cow import (
_chained_assignment_method_msg,
_chained_assignment_msg,
+ _chained_assignment_warning_method_msg,
+ _chained_assignment_warning_msg,
)
from pandas.util._decorators import (
Appender,
Substitution,
deprecate_nonkeyword_arguments,
doc,
- set_module,
)
from pandas.util._exceptions import (
find_stack_level,
@@ -85,6 +89,7 @@
find_common_type,
infer_dtype_from_scalar,
invalidate_string_dtypes,
+ maybe_box_native,
maybe_downcast_to_dtype,
)
from pandas.core.dtypes.common import (
@@ -124,7 +129,7 @@
ops,
roperator,
)
-from pandas.core.accessor import Accessor
+from pandas.core.accessor import CachedAccessor
from pandas.core.apply import reconstruct_and_relabel_result
from pandas.core.array_algos.take import take_2d_multi
from pandas.core.arraylike import OpsMixin
@@ -162,11 +167,15 @@
check_bool_indexer,
check_dict_or_set_indexers,
)
-from pandas.core.internals import BlockManager
+from pandas.core.internals import (
+ ArrayManager,
+ BlockManager,
+)
from pandas.core.internals.construction import (
arrays_to_mgr,
dataclasses_to_dicts,
dict_to_mgr,
+ mgr_to_mgr,
ndarray_to_mgr,
nested_data_to_arrays,
rec_array_to_mgr,
@@ -219,17 +228,15 @@
FormattersType,
Frequency,
FromDictOrient,
- HashableT,
- HashableT2,
IgnoreRaise,
IndexKeyFunc,
IndexLabel,
JoinValidate,
Level,
- ListLike,
MergeHow,
MergeValidate,
MutableMappingT,
+ NaAction,
NaPosition,
NsmallestNlargestKeep,
PythonFuncType,
@@ -243,7 +250,7 @@
SortKind,
StorageOptions,
Suffixes,
- T,
+ ToGbqIfexist,
ToStataByteorder,
ToTimestampHow,
UpdateJoin,
@@ -255,7 +262,7 @@
from pandas.core.groupby.generic import DataFrameGroupBy
from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
- from pandas.core.internals.managers import SingleBlockManager
+ from pandas.core.internals import SingleDataManager
from pandas.io.formats.style import Styler
@@ -315,8 +322,7 @@
----------%s
right : DataFrame or named Series
Object to merge with.
-how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
- default 'inner'
+how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
Type of merge to be performed.
* left: use only keys from left frame, similar to a SQL left outer join;
@@ -329,10 +335,6 @@
join; preserve the order of the left keys.
* cross: creates the cartesian product from both frames, preserves the order
of the left keys.
- * left_anti: use only keys from left frame that are not in right frame, similar
- to SQL left anti join; preserve key order.
- * right_anti: use only keys from right frame that are not in left frame, similar
- to SQL right anti join; preserve key order.
on : label or list
Column or index level names to join on. These must be found in both
DataFrames. If `on` is None and not merging on indexes then this defaults
@@ -362,7 +364,7 @@
of a string to indicate that the column name from `left` or
`right` should be left as-is, with no suffix. At least one of the
values must not be None.
-copy : bool, default False
+copy : bool, default True
If False, avoid copy if possible.
.. note::
@@ -376,8 +378,6 @@
You can already get the future behavior and improvements through
enabling copy on write ``pd.options.mode.copy_on_write = True``
-
- .. deprecated:: 3.0.0
indicator : bool or str, default False
If True, adds a column to the output DataFrame called "_merge" with
information on the source of each row. The column can be given a different
@@ -506,7 +506,6 @@
# DataFrame class
-@set_module("pandas")
class DataFrame(NDFrame, OpsMixin):
"""
Two-dimensional, size-mutable, potentially heterogeneous tabular data.
@@ -536,7 +535,6 @@ class DataFrame(NDFrame, OpsMixin):
will perform column selection instead.
dtype : dtype, default None
Data type to force. Only a single dtype is allowed. If None, infer.
- If ``data`` is DataFrame then is ignored.
copy : bool or None, default None
Copy data from inputs.
For dict data, the default of None behaves like ``copy=True``. For DataFrame
@@ -562,7 +560,7 @@ class DataFrame(NDFrame, OpsMixin):
--------
Constructing DataFrame from a dictionary.
- >>> d = {"col1": [1, 2], "col2": [3, 4]}
+ >>> d = {'col1': [1, 2], 'col2': [3, 4]}
>>> df = pd.DataFrame(data=d)
>>> df
col1 col2
@@ -586,7 +584,7 @@ class DataFrame(NDFrame, OpsMixin):
Constructing DataFrame from a dictionary including Series:
- >>> d = {"col1": [0, 1, 2, 3], "col2": pd.Series([2, 3], index=[2, 3])}
+ >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
>>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
col1 col2
0 0 NaN
@@ -596,9 +594,8 @@ class DataFrame(NDFrame, OpsMixin):
Constructing DataFrame from numpy ndarray:
- >>> df2 = pd.DataFrame(
- ... np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=["a", "b", "c"]
- ... )
+ >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
+ ... columns=['a', 'b', 'c'])
>>> df2
a b c
0 1 2 3
@@ -607,11 +604,10 @@ class DataFrame(NDFrame, OpsMixin):
Constructing DataFrame from a numpy ndarray that has labeled columns:
- >>> data = np.array(
- ... [(1, 2, 3), (4, 5, 6), (7, 8, 9)],
- ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")],
- ... )
- >>> df3 = pd.DataFrame(data, columns=["c", "a"])
+ >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
+ ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
+ >>> df3 = pd.DataFrame(data, columns=['c', 'a'])
+ ...
>>> df3
c a
0 3 1
@@ -650,14 +646,14 @@ class DataFrame(NDFrame, OpsMixin):
_HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
_accessors: set[str] = {"sparse"}
_hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([])
- _mgr: BlockManager
+ _mgr: BlockManager | ArrayManager
# similar to __array_priority__, positions DataFrame before Series, Index,
# and ExtensionArray. Should NOT be overridden by subclasses.
__pandas_priority__ = 4000
@property
- def _constructor(self) -> type[DataFrame]:
+ def _constructor(self) -> Callable[..., DataFrame]:
return DataFrame
def _constructor_from_mgr(self, mgr, axes) -> DataFrame:
@@ -715,7 +711,7 @@ def __init__(
# to avoid the result sharing the same Manager
data = data.copy(deep=False)
- if isinstance(data, BlockManager):
+ if isinstance(data, (BlockManager, ArrayManager)):
if not allow_mgr:
# GH#52419
warnings.warn(
@@ -723,10 +719,11 @@ def __init__(
"is deprecated and will raise in a future version. "
"Use public APIs instead.",
DeprecationWarning,
- stacklevel=2,
+ stacklevel=1, # bump to 2 once pyarrow 15.0 is released with fix
)
- data = data.copy(deep=False)
+ if using_copy_on_write():
+ data = data.copy(deep=False)
# first check if a Manager is passed without any other arguments
# -> use fastpath (without checking Manager type)
if index is None and columns is None and dtype is None and not copy:
@@ -734,6 +731,12 @@ def __init__(
NDFrame.__init__(self, data)
return
+ manager = _get_option("mode.data_manager", silent=True)
+
+ is_pandas_object = isinstance(data, (Series, Index, ExtensionArray))
+ data_dtype = getattr(data, "dtype", None)
+ original_dtype = dtype
+
# GH47215
if isinstance(index, set):
raise ValueError("index cannot be a set")
@@ -744,7 +747,17 @@ def __init__(
if isinstance(data, dict):
# retain pre-GH#38939 default behavior
copy = True
- elif not isinstance(data, (Index, DataFrame, Series)):
+ elif (
+ manager == "array"
+ and isinstance(data, (np.ndarray, ExtensionArray))
+ and data.ndim == 2
+ ):
+ # INFO(ArrayManager) by default copy the 2D input array to get
+ # contiguous 1D arrays
+ copy = True
+ elif using_copy_on_write() and not isinstance(
+ data, (Index, DataFrame, Series)
+ ):
copy = True
else:
copy = False
@@ -755,14 +768,14 @@ def __init__(
dtype = dtype if dtype is not None else pandas_dtype(object)
data = []
- if isinstance(data, BlockManager):
+ if isinstance(data, (BlockManager, ArrayManager)):
mgr = self._init_mgr(
data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
)
elif isinstance(data, dict):
# GH#38939 de facto copy defaults to False only in non-dict cases
- mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy)
+ mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
elif isinstance(data, ma.MaskedArray):
from numpy.ma import mrecords
@@ -782,6 +795,7 @@ def __init__(
columns,
dtype=dtype,
copy=copy,
+ typ=manager,
)
elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)):
@@ -794,9 +808,11 @@ def __init__(
columns,
dtype,
copy,
+ typ=manager,
)
elif getattr(data, "name", None) is not None:
# i.e. Series/Index with non-None name
+ _copy = copy if using_copy_on_write() else True
mgr = dict_to_mgr(
# error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
# attribute "name"
@@ -804,7 +820,8 @@ def __init__(
index,
columns,
dtype=dtype,
- copy=copy,
+ typ=manager,
+ copy=_copy,
)
else:
mgr = ndarray_to_mgr(
@@ -813,6 +830,7 @@ def __init__(
columns,
dtype=dtype,
copy=copy,
+ typ=manager,
)
# For data is list-like, or Iterable (will consume into list)
@@ -843,6 +861,7 @@ def __init__(
columns,
index,
dtype=dtype,
+ typ=manager,
)
else:
mgr = ndarray_to_mgr(
@@ -851,6 +870,7 @@ def __init__(
columns,
dtype=dtype,
copy=copy,
+ typ=manager,
)
else:
mgr = dict_to_mgr(
@@ -858,6 +878,7 @@ def __init__(
index,
columns if columns is not None else default_index(0),
dtype=dtype,
+ typ=manager,
)
# For data is scalar
else:
@@ -878,7 +899,7 @@ def __init__(
construct_1d_arraylike_from_scalar(data, len(index), dtype)
for _ in range(len(columns))
]
- mgr = arrays_to_mgr(values, columns, index, dtype=None)
+ mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager)
else:
arr2d = construct_2d_arraylike_from_scalar(
data,
@@ -894,10 +915,26 @@ def __init__(
columns,
dtype=arr2d.dtype,
copy=False,
+ typ=manager,
)
+ # ensure correct Manager type according to settings
+ mgr = mgr_to_mgr(mgr, typ=manager)
+
NDFrame.__init__(self, mgr)
+ if original_dtype is None and is_pandas_object and data_dtype == np.object_:
+ if self.dtypes.iloc[0] != data_dtype:
+ warnings.warn(
+ "Dtype inference on a pandas object "
+ "(Series, Index, ExtensionArray) is deprecated. The DataFrame "
+ "constructor will keep the original dtype in the future. "
+ "Call `infer_objects` on the result to get the old "
+ "behavior.",
+ FutureWarning,
+ stacklevel=2,
+ )
+
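A minimal sketch of the behavior this warning describes (the Series ``ser`` is illustrative, not part of the patch): today the constructor infers a dtype from an object-dtype pandas input and emits the warning above; calling ``infer_objects`` explicitly keeps that inference once the constructor starts preserving the original dtype.

>>> ser = pd.Series([1, 2], dtype=object)
>>> pd.DataFrame(ser)                  # currently infers int64 and warns  # doctest: +SKIP
>>> pd.DataFrame(ser).infer_objects()  # explicit inference, future-proof  # doctest: +SKIP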
# ----------------------------------------------------------------------
def __dataframe__(
@@ -906,19 +943,6 @@ def __dataframe__(
"""
Return the dataframe interchange object implementing the interchange protocol.
- .. note::
-
- For new development, we highly recommend using the Arrow C Data Interface
- alongside the Arrow PyCapsule Interface instead of the interchange protocol
-
- .. warning::
-
- Due to severe implementation issues, we recommend only considering using the
- interchange protocol in the following cases:
-
- - converting to pandas: for pandas >= 2.0.3
- - converting from pandas: for pandas >= 3.0.0
-
Parameters
----------
nan_as_null : bool, default False
@@ -933,11 +957,6 @@ def __dataframe__(
DataFrame interchange object
The object which consuming library can use to ingress the dataframe.
- See Also
- --------
- DataFrame.from_records : Constructor from tuples, also record arrays.
- DataFrame.from_dict : From dicts of Series, arrays, or dicts.
-
Notes
-----
Details on the interchange protocol:
@@ -945,13 +964,12 @@ def __dataframe__(
Examples
--------
- >>> df_not_necessarily_pandas = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
+ >>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
>>> interchange_object = df_not_necessarily_pandas.__dataframe__()
>>> interchange_object.column_names()
Index(['A', 'B'], dtype='object')
- >>> df_pandas = pd.api.interchange.from_dataframe(
- ... interchange_object.select_columns_by_name(["A"])
- ... )
+ >>> df_pandas = (pd.api.interchange.from_dataframe
+ ... (interchange_object.select_columns_by_name(['A'])))
>>> df_pandas
A
0 1
@@ -965,6 +983,21 @@ def __dataframe__(
return PandasDataFrameXchg(self, allow_copy=allow_copy)
+ def __dataframe_consortium_standard__(
+ self, *, api_version: str | None = None
+ ) -> Any:
+ """
+ Provide entry point to the Consortium DataFrame Standard API.
+
+ This is developed and maintained outside of pandas.
+ Please report any issues to https://github.com/data-apis/dataframe-api-compat.
+ """
+ dataframe_api_compat = import_optional_dependency("dataframe_api_compat")
+ convert_to_standard_compliant_dataframe = (
+ dataframe_api_compat.pandas_standard.convert_to_standard_compliant_dataframe
+ )
+ return convert_to_standard_compliant_dataframe(self, api_version=api_version)
+
def __arrow_c_stream__(self, requested_schema=None):
"""
Export the pandas DataFrame as an Arrow C stream PyCapsule.
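As a brief usage sketch (editor's illustration, not part of this patch; pyarrow must be installed for pandas to build the stream), the returned PyCapsule can be handed to any Arrow-aware consumer:

>>> df = pd.DataFrame({"a": [1, 2]})
>>> capsule = df.__arrow_c_stream__()  # PyCapsule wrapping an ArrowArrayStream  # doctest: +SKIP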
@@ -1002,14 +1035,9 @@ def axes(self) -> list[Index]:
It has the row axis labels and column axis labels as the only members.
They are returned in that order.
- See Also
- --------
- DataFrame.index: The index (row labels) of the DataFrame.
- DataFrame.columns: The column labels of the DataFrame.
-
Examples
--------
- >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
+ >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.axes
[RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
dtype='object')]
@@ -1021,21 +1049,18 @@ def shape(self) -> tuple[int, int]:
"""
Return a tuple representing the dimensionality of the DataFrame.
- Unlike the `len()` method, which only returns the number of rows, `shape`
- provides both row and column counts, making it a more informative method for
- understanding dataset size.
-
See Also
--------
- numpy.ndarray.shape : Tuple of array dimensions.
+ ndarray.shape : Tuple of array dimensions.
Examples
--------
- >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
+ >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.shape
(2, 2)
- >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4], "col3": [5, 6]})
+ >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
+ ... 'col3': [5, 6]})
>>> df.shape
(2, 3)
"""
@@ -1060,22 +1085,21 @@ def _is_homogeneous_type(self) -> bool:
Items with the same type but different sizes are considered
different types.
- >>> DataFrame(
- ... {
- ... "A": np.array([1, 2], dtype=np.int32),
- ... "B": np.array([1, 2], dtype=np.int64),
- ... }
- ... )._is_homogeneous_type
+ >>> DataFrame({
+ ... "A": np.array([1, 2], dtype=np.int32),
+ ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
False
"""
# The "<" part of "<=" here is for empty DataFrame cases
- return len({block.values.dtype for block in self._mgr.blocks}) <= 1
+ return len({arr.dtype for arr in self._mgr.arrays}) <= 1
@property
def _can_fast_transpose(self) -> bool:
"""
Can we transpose this DataFrame without creating any new array objects.
"""
+ if isinstance(self._mgr, ArrayManager):
+ return False
blocks = self._mgr.blocks
if len(blocks) != 1:
return False
@@ -1091,6 +1115,13 @@ def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray:
"""
mgr = self._mgr
+ if isinstance(mgr, ArrayManager):
+ if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype):
+ # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]"
+ # has no attribute "reshape"
+ return mgr.arrays[0].reshape(-1, 1) # type: ignore[union-attr]
+ return ensure_wrapped_if_datetimelike(self.values)
+
blocks = mgr.blocks
if len(blocks) != 1:
return ensure_wrapped_if_datetimelike(self.values)
@@ -1201,7 +1232,6 @@ def _repr_html_(self) -> str | None:
min_rows = get_option("display.min_rows")
max_cols = get_option("display.max_columns")
show_dimensions = get_option("display.show_dimensions")
- show_floats = get_option("display.float_format")
formatter = fmt.DataFrameFormatter(
self,
@@ -1209,7 +1239,7 @@ def _repr_html_(self) -> str | None:
col_space=None,
na_rep="NaN",
formatters=None,
- float_format=show_floats,
+ float_format=None,
sparsify=None,
justify=None,
index_names=True,
@@ -1231,7 +1261,6 @@ def _repr_html_(self) -> str | None:
def to_string(
self,
buf: None = ...,
- *,
columns: Axes | None = ...,
col_space: int | list[int] | dict[Hashable, int] | None = ...,
header: bool | SequenceNotStr[str] = ...,
@@ -1250,13 +1279,13 @@ def to_string(
min_rows: int | None = ...,
max_colwidth: int | None = ...,
encoding: str | None = ...,
- ) -> str: ...
+ ) -> str:
+ ...
@overload
def to_string(
self,
buf: FilePath | WriteBuffer[str],
- *,
columns: Axes | None = ...,
col_space: int | list[int] | dict[Hashable, int] | None = ...,
header: bool | SequenceNotStr[str] = ...,
@@ -1275,8 +1304,12 @@ def to_string(
min_rows: int | None = ...,
max_colwidth: int | None = ...,
encoding: str | None = ...,
- ) -> None: ...
+ ) -> None:
+ ...
+ @deprecate_nonkeyword_arguments(
+ version="3.0", allowed_args=["self", "buf"], name="to_string"
+ )
@Substitution(
header_type="bool or list of str",
header="Write out the column names. If a list of columns "
@@ -1291,7 +1324,6 @@ def to_string(
def to_string(
self,
buf: FilePath | WriteBuffer[str] | None = None,
- *,
columns: Axes | None = None,
col_space: int | list[int] | dict[Hashable, int] | None = None,
header: bool | SequenceNotStr[str] = True,
@@ -1330,7 +1362,7 @@ def to_string(
Examples
--------
- >>> d = {"col1": [1, 2, 3], "col2": [4, 5, 6]}
+ >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
>>> df = pd.DataFrame(d)
>>> print(df.to_string())
col1 col2
@@ -1373,7 +1405,7 @@ def _get_values_for_csv(
decimal: str,
na_rep: str,
quoting, # int csv.QUOTE_FOO from stdlib
- ) -> DataFrame:
+ ) -> Self:
# helper used by to_csv
mgr = self._mgr.get_values_for_csv(
float_format=float_format,
@@ -1382,7 +1414,8 @@ def _get_values_for_csv(
na_rep=na_rep,
quoting=quoting,
)
- return self._constructor_from_mgr(mgr, axes=mgr.axes)
+ # error: Incompatible return value type (got "DataFrame", expected "Self")
+ return self._constructor_from_mgr(mgr, axes=mgr.axes) # type: ignore[return-value]
# ----------------------------------------------------------------------
@@ -1400,7 +1433,7 @@ def style(self) -> Styler:
Examples
--------
- >>> df = pd.DataFrame({"A": [1, 2, 3]})
+ >>> df = pd.DataFrame({'A': [1, 2, 3]})
>>> df.style # doctest: +SKIP
Please see
@@ -1415,7 +1448,9 @@ def style(self) -> Styler:
return Styler(self)
- _shared_docs["items"] = r"""
+ _shared_docs[
+ "items"
+ ] = r"""
Iterate over (column name, Series) pairs.
Iterates over the DataFrame columns, returning a tuple with
@@ -1465,8 +1500,12 @@ def style(self) -> Styler:
@Appender(_shared_docs["items"])
def items(self) -> Iterable[tuple[Hashable, Series]]:
- for i, k in enumerate(self.columns):
- yield k, self._ixs(i, axis=1)
+ if self.columns.is_unique and hasattr(self, "_item_cache"):
+ for k in self.columns:
+ yield k, self._get_item_cache(k)
+ else:
+ for i, k in enumerate(self.columns):
+ yield k, self._ixs(i, axis=1)
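A small illustration of the iteration contract preserved by both branches above (frame contents are made up): ``items`` yields each column label together with that column as a Series.

>>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
>>> [label for label, column in df.items()]
['col1', 'col2']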
def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
"""
@@ -1502,23 +1541,24 @@ def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
Examples
--------
- >>> df = pd.DataFrame([[1, 1.5]], columns=["int", "float"])
+ >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
>>> row = next(df.iterrows())[1]
>>> row
int 1.0
float 1.5
Name: 0, dtype: float64
- >>> print(row["int"].dtype)
+ >>> print(row['int'].dtype)
float64
- >>> print(df["int"].dtype)
+ >>> print(df['int'].dtype)
int64
"""
columns = self.columns
klass = self._constructor_sliced
+ using_cow = using_copy_on_write()
for k, v in zip(self.index, self.values):
s = klass(v, index=columns, name=k).__finalize__(self)
- if self._mgr.is_single_block:
- s._mgr.add_references(self._mgr)
+ if using_cow and self._mgr.is_single_block:
+ s._mgr.add_references(self._mgr) # type: ignore[arg-type]
yield k, s
def itertuples(
@@ -1555,15 +1595,15 @@ def itertuples(
Examples
--------
- >>> df = pd.DataFrame(
- ... {"num_legs": [4, 2], "num_wings": [0, 2]}, index=["dog", "hawk"]
- ... )
+ >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
+ ... index=['dog', 'hawk'])
>>> df
num_legs num_wings
dog 4 0
hawk 2 2
>>> for row in df.itertuples():
... print(row)
+ ...
Pandas(Index='dog', num_legs=4, num_wings=0)
Pandas(Index='hawk', num_legs=2, num_wings=2)
@@ -1572,14 +1612,16 @@ def itertuples(
>>> for row in df.itertuples(index=False):
... print(row)
+ ...
Pandas(num_legs=4, num_wings=0)
Pandas(num_legs=2, num_wings=2)
With the `name` parameter set we set a custom name for the yielded
namedtuples:
- >>> for row in df.itertuples(name="Animal"):
+ >>> for row in df.itertuples(name='Animal'):
... print(row)
+ ...
Animal(Index='dog', num_legs=4, num_wings=0)
Animal(Index='hawk', num_legs=2, num_wings=2)
"""
@@ -1610,10 +1652,12 @@ def __len__(self) -> int:
return len(self.index)
@overload
- def dot(self, other: Series) -> Series: ...
+ def dot(self, other: Series) -> Series:
+ ...
@overload
- def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame: ...
+ def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame:
+ ...
def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
"""
@@ -1697,8 +1741,8 @@ def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
if len(common) > len(self.columns) or len(common) > len(other.index):
raise ValueError("matrices are not aligned")
- left = self.reindex(columns=common)
- right = other.reindex(index=common)
+ left = self.reindex(columns=common, copy=False)
+ right = other.reindex(index=common, copy=False)
lvals = left.values
rvals = right._values
else:
@@ -1734,10 +1778,12 @@ def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
raise TypeError(f"unsupported type: {type(other)}")
@overload
- def __matmul__(self, other: Series) -> Series: ...
+ def __matmul__(self, other: Series) -> Series:
+ ...
@overload
- def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: ...
+ def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
+ ...
def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
"""
@@ -1810,7 +1856,7 @@ def from_dict(
--------
By default the keys of the dict become the DataFrame columns:
- >>> data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}
+ >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
>>> pd.DataFrame.from_dict(data)
col_1 col_2
0 3 a
@@ -1821,8 +1867,8 @@ def from_dict(
Specify ``orient='index'`` to create the DataFrame using dictionary
keys as rows:
- >>> data = {"row_1": [3, 2, 1, 0], "row_2": ["a", "b", "c", "d"]}
- >>> pd.DataFrame.from_dict(data, orient="index")
+ >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
+ >>> pd.DataFrame.from_dict(data, orient='index')
0 1 2 3
row_1 3 2 1 0
row_2 a b c d
@@ -1830,7 +1876,8 @@ def from_dict(
When using the 'index' orientation, the column names can be
specified manually:
- >>> pd.DataFrame.from_dict(data, orient="index", columns=["A", "B", "C", "D"])
+ >>> pd.DataFrame.from_dict(data, orient='index',
+ ... columns=['A', 'B', 'C', 'D'])
A B C D
row_1 3 2 1 0
row_2 a b c d
@@ -1838,21 +1885,19 @@ def from_dict(
Specify ``orient='tight'`` to create the DataFrame using a 'tight'
format:
- >>> data = {
- ... "index": [("a", "b"), ("a", "c")],
- ... "columns": [("x", 1), ("y", 2)],
- ... "data": [[1, 3], [2, 4]],
- ... "index_names": ["n1", "n2"],
- ... "column_names": ["z1", "z2"],
- ... }
- >>> pd.DataFrame.from_dict(data, orient="tight")
+ >>> data = {'index': [('a', 'b'), ('a', 'c')],
+ ... 'columns': [('x', 1), ('y', 2)],
+ ... 'data': [[1, 3], [2, 4]],
+ ... 'index_names': ['n1', 'n2'],
+ ... 'column_names': ['z1', 'z2']}
+ >>> pd.DataFrame.from_dict(data, orient='tight')
z1 x y
z2 1 2
n1 n2
a b 1 3
c 2 4
"""
- index: list | Index | None = None
+ index = None
orient = orient.lower() # type: ignore[assignment]
if orient == "index":
if len(data) > 0:
@@ -1878,7 +1923,7 @@ def from_dict(
else:
realdata = data["data"]
- def create_index(indexlist, namelist) -> Index:
+ def create_index(indexlist, namelist):
index: Index
if len(namelist) > 1:
index = MultiIndex.from_tuples(indexlist, names=namelist)
@@ -1921,7 +1966,6 @@ def to_numpy(
Returns
-------
numpy.ndarray
- The NumPy array representing the values in the DataFrame.
See Also
--------
@@ -1944,7 +1988,7 @@ def to_numpy(
For a mix of numeric and non-numeric types, the output array will
have object dtype.
- >>> df["C"] = pd.date_range("2000", periods=2)
+ >>> df['C'] = pd.date_range('2000', periods=2)
>>> df.to_numpy()
array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
[2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
@@ -1957,6 +2001,28 @@ def to_numpy(
return result
+ def _create_data_for_split_and_tight_to_dict(
+ self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
+ ) -> list:
+ """
+ Simple helper method to create the main output data for
+ ``to_dict(orient="split")`` and ``to_dict(orient="tight")``.
+ """
+ if are_all_object_dtype_cols:
+ data = [
+ list(map(maybe_box_native, t))
+ for t in self.itertuples(index=False, name=None)
+ ]
+ else:
+ data = [list(t) for t in self.itertuples(index=False, name=None)]
+ if object_dtype_indices:
+ # If we have object_dtype_cols, apply maybe_box_native after list
+ # comprehension for perf
+ for row in data:
+ for i in object_dtype_indices:
+ row[i] = maybe_box_native(row[i])
+ return data
+
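An informal example of the output this helper feeds into for the 'split'/'tight' orients (values are made up); object-dtype cells are boxed to native Python scalars via ``maybe_box_native``:

>>> df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
>>> df.to_dict(orient="split")["data"]
[[1, 'x'], [2, 'y']]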
@overload
def to_dict(
self,
@@ -1964,7 +2030,8 @@ def to_dict(
*,
into: type[MutableMappingT] | MutableMappingT,
index: bool = ...,
- ) -> MutableMappingT: ...
+ ) -> MutableMappingT:
+ ...
@overload
def to_dict(
@@ -1973,7 +2040,8 @@ def to_dict(
*,
into: type[MutableMappingT] | MutableMappingT,
index: bool = ...,
- ) -> list[MutableMappingT]: ...
+ ) -> list[MutableMappingT]:
+ ...
@overload
def to_dict(
@@ -1982,7 +2050,8 @@ def to_dict(
*,
into: type[dict] = ...,
index: bool = ...,
- ) -> dict: ...
+ ) -> dict:
+ ...
@overload
def to_dict(
@@ -1991,17 +2060,21 @@ def to_dict(
*,
into: type[dict] = ...,
index: bool = ...,
- ) -> list[dict]: ...
+ ) -> list[dict]:
+ ...
# error: Incompatible default for argument "into" (default has type "type
# [dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT")
+ @deprecate_nonkeyword_arguments(
+ version="3.0", allowed_args=["self", "orient"], name="to_dict"
+ )
def to_dict(
self,
orient: Literal[
"dict", "list", "series", "split", "tight", "records", "index"
] = "dict",
- *,
- into: type[MutableMappingT] | MutableMappingT = dict, # type: ignore[assignment]
+ into: type[MutableMappingT]
+ | MutableMappingT = dict, # type: ignore[assignment]
index: bool = True,
) -> MutableMappingT | list[MutableMappingT]:
"""
@@ -2039,9 +2112,7 @@ def to_dict(
index : bool, default True
Whether to include the index item (and index_names item if `orient`
is 'tight') in the returned dictionary. Can only be ``False``
- when `orient` is 'split' or 'tight'. Note that when `orient` is
- 'records', this parameter does not take effect (index item always
- not included).
+ when `orient` is 'split' or 'tight'.
.. versionadded:: 2.0.0
@@ -2059,9 +2130,9 @@ def to_dict(
Examples
--------
- >>> df = pd.DataFrame(
- ... {"col1": [1, 2], "col2": [0.5, 0.75]}, index=["row1", "row2"]
- ... )
+ >>> df = pd.DataFrame({'col1': [1, 2],
+ ... 'col2': [0.5, 0.75]},
+ ... index=['row1', 'row2'])
>>> df
col1 col2
row1 1 0.50
@@ -2071,7 +2142,7 @@ def to_dict(
You can specify the return orientation.
- >>> df.to_dict("series")
+ >>> df.to_dict('series')
{'col1': row1 1
row2 2
Name: col1, dtype: int64,
@@ -2079,17 +2150,17 @@ def to_dict(
row2 0.75
Name: col2, dtype: float64}
- >>> df.to_dict("split")
+ >>> df.to_dict('split')
{'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
'data': [[1, 0.5], [2, 0.75]]}
- >>> df.to_dict("records")
+ >>> df.to_dict('records')
[{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
- >>> df.to_dict("index")
+ >>> df.to_dict('index')
{'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
- >>> df.to_dict("tight")
+ >>> df.to_dict('tight')
{'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}
@@ -2103,7 +2174,7 @@ def to_dict(
If you want a `defaultdict`, you need to initialize it:
>>> dd = defaultdict(list)
- >>> df.to_dict("records", into=dd)
+ >>> df.to_dict('records', into=dd)
[defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
"""
@@ -2111,6 +2182,144 @@ def to_dict(
return to_dict(self, orient, into=into, index=index)
+ @deprecate_nonkeyword_arguments(
+ version="3.0", allowed_args=["self", "destination_table"], name="to_gbq"
+ )
+ def to_gbq(
+ self,
+ destination_table: str,
+ project_id: str | None = None,
+ chunksize: int | None = None,
+ reauth: bool = False,
+ if_exists: ToGbqIfexist = "fail",
+ auth_local_webserver: bool = True,
+ table_schema: list[dict[str, str]] | None = None,
+ location: str | None = None,
+ progress_bar: bool = True,
+ credentials=None,
+ ) -> None:
+ """
+ Write a DataFrame to a Google BigQuery table.
+
+ .. deprecated:: 2.2.0
+
+ Please use ``pandas_gbq.to_gbq`` instead.
+
+ This function requires the `pandas-gbq package
+ `__.
+
+ See the `How to authenticate with Google BigQuery
+ `__
+ guide for authentication instructions.
+
+ Parameters
+ ----------
+ destination_table : str
+ Name of table to be written, in the form ``dataset.tablename``.
+ project_id : str, optional
+ Google BigQuery Account project ID. Optional when available from
+ the environment.
+ chunksize : int, optional
+ Number of rows to be inserted in each chunk from the dataframe.
+ Set to ``None`` to load the whole dataframe at once.
+ reauth : bool, default False
+ Force Google BigQuery to re-authenticate the user. This is useful
+ if multiple accounts are used.
+ if_exists : str, default 'fail'
+ Behavior when the destination table exists. Value can be one of:
+
+ ``'fail'``
+ If table exists, raise pandas_gbq.gbq.TableCreationError.
+ ``'replace'``
+ If table exists, drop it, recreate it, and insert data.
+ ``'append'``
+ If table exists, insert data. Create the table if it does not exist.
+ auth_local_webserver : bool, default True
+ Use the `local webserver flow`_ instead of the `console flow`_
+ when getting user credentials.
+
+ .. _local webserver flow:
+ https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
+ .. _console flow:
+ https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
+
+ *New in version 0.2.0 of pandas-gbq*.
+
+ .. versionchanged:: 1.5.0
+ Default value is changed to ``True``. Google has deprecated the
+ ``auth_local_webserver = False`` `"out of band" (copy-paste)
+ flow
+ `_.
+ table_schema : list of dicts, optional
+ List of BigQuery table fields to which the DataFrame
+ columns conform, e.g. ``[{'name': 'col1', 'type':
+ 'STRING'},...]``. If schema is not provided, it will be
+ generated according to dtypes of DataFrame columns. See
+ BigQuery API documentation on available names of a field.
+
+ *New in version 0.3.1 of pandas-gbq*.
+ location : str, optional
+ Location where the load job should run. See the `BigQuery locations
+ documentation
+ `__ for a
+ list of available locations. The location must match that of the
+ target dataset.
+
+ *New in version 0.5.0 of pandas-gbq*.
+ progress_bar : bool, default True
+ Use the library `tqdm` to show the progress bar for the upload,
+ chunk by chunk.
+
+ *New in version 0.5.0 of pandas-gbq*.
+ credentials : google.auth.credentials.Credentials, optional
+ Credentials for accessing Google APIs. Use this parameter to
+ override default credentials, such as to use Compute Engine
+ :class:`google.auth.compute_engine.Credentials` or Service
+ Account :class:`google.oauth2.service_account.Credentials`
+ directly.
+
+ *New in version 0.8.0 of pandas-gbq*.
+
+ See Also
+ --------
+ pandas_gbq.to_gbq : This function in the pandas-gbq library.
+ read_gbq : Read a DataFrame from Google BigQuery.
+
+ Examples
+ --------
+ Example taken from `Google BigQuery documentation
+ `_
+
+ >>> project_id = "my-project"
+ >>> table_id = 'my_dataset.my_table'
+ >>> df = pd.DataFrame({
+ ... "my_string": ["a", "b", "c"],
+ ... "my_int64": [1, 2, 3],
+ ... "my_float64": [4.0, 5.0, 6.0],
+ ... "my_bool1": [True, False, True],
+ ... "my_bool2": [False, True, False],
+ ... "my_dates": pd.date_range("now", periods=3),
+ ... }
+ ... )
+
+ >>> df.to_gbq(table_id, project_id=project_id) # doctest: +SKIP
+ """
+ from pandas.io import gbq
+
+ gbq.to_gbq(
+ self,
+ destination_table,
+ project_id=project_id,
+ chunksize=chunksize,
+ reauth=reauth,
+ if_exists=if_exists,
+ auth_local_webserver=auth_local_webserver,
+ table_schema=table_schema,
+ location=location,
+ progress_bar=progress_bar,
+ credentials=credentials,
+ )
+
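Since this method simply delegates to pandas-gbq and is deprecated, a hedged sketch of the direct call the deprecation note recommends (the table and project identifiers are placeholders):

>>> import pandas_gbq  # doctest: +SKIP
>>> pandas_gbq.to_gbq(df, "my_dataset.my_table", project_id="my-project")  # doctest: +SKIP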
@classmethod
def from_records(
cls,
@@ -2124,13 +2333,16 @@ def from_records(
"""
Convert structured or record ndarray to DataFrame.
- Creates a DataFrame object from a structured ndarray, or sequence of
- tuples or dicts.
+ Creates a DataFrame object from a structured ndarray, sequence of
+ tuples or dicts, or DataFrame.
Parameters
----------
- data : structured ndarray, sequence of tuples or dicts
+ data : structured ndarray, sequence of tuples or dicts, or DataFrame
Structured input data.
+
+ .. deprecated:: 2.1.0
+ Passing a DataFrame is deprecated.
index : str, list of fields, array-like
Field of array to use as the index, alternately a specific set of
input labels to use.
@@ -2139,10 +2351,9 @@ def from_records(
columns : sequence, default None
Column names to use. If the passed data do not have names
associated with them, this argument provides names for the
- columns. Otherwise, this argument indicates the order of the columns
+ columns. Otherwise this argument indicates the order of the columns
in the result (any names not found in the data will become all-NA
- columns) and limits the data to these columns if not all column names
- are provided.
+ columns).
coerce_float : bool, default False
Attempt to convert values of non-string, non-numeric objects (like
decimal.Decimal) to floating point, useful for SQL result sets.
@@ -2162,10 +2373,8 @@ def from_records(
--------
Data can be provided as a structured ndarray:
- >>> data = np.array(
- ... [(3, "a"), (2, "b"), (1, "c"), (0, "d")],
- ... dtype=[("col_1", "i4"), ("col_2", "U1")],
- ... )
+ >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')],
+ ... dtype=[('col_1', 'i4'), ('col_2', 'U1')])
>>> pd.DataFrame.from_records(data)
col_1 col_2
0 3 a
@@ -2175,12 +2384,10 @@ def from_records(
Data can be provided as a list of dicts:
- >>> data = [
- ... {"col_1": 3, "col_2": "a"},
- ... {"col_1": 2, "col_2": "b"},
- ... {"col_1": 1, "col_2": "c"},
- ... {"col_1": 0, "col_2": "d"},
- ... ]
+ >>> data = [{'col_1': 3, 'col_2': 'a'},
+ ... {'col_1': 2, 'col_2': 'b'},
+ ... {'col_1': 1, 'col_2': 'c'},
+ ... {'col_1': 0, 'col_2': 'd'}]
>>> pd.DataFrame.from_records(data)
col_1 col_2
0 3 a
@@ -2190,8 +2397,8 @@ def from_records(
Data can be provided as a list of tuples with corresponding columns:
- >>> data = [(3, "a"), (2, "b"), (1, "c"), (0, "d")]
- >>> pd.DataFrame.from_records(data, columns=["col_1", "col_2"])
+ >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')]
+ >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2'])
col_1 col_2
0 3 a
1 2 b
@@ -2199,10 +2406,21 @@ def from_records(
3 0 d
"""
if isinstance(data, DataFrame):
- raise TypeError(
- "Passing a DataFrame to DataFrame.from_records is not supported. Use "
+ warnings.warn(
+ "Passing a DataFrame to DataFrame.from_records is deprecated. Use "
"set_index and/or drop to modify the DataFrame instead.",
+ FutureWarning,
+ stacklevel=find_stack_level(),
)
+ if columns is not None:
+ if is_scalar(columns):
+ columns = [columns]
+ data = data[columns]
+ if index is not None:
+ data = data.set_index(index)
+ if exclude is not None:
+ data = data.drop(columns=exclude)
+ return data.copy(deep=False)
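A short sketch of the replacement suggested by the warning above (column names are hypothetical): rather than passing a DataFrame to ``from_records``, apply ``set_index`` and ``drop`` directly.

>>> df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"], "tmp": [0, 0]})
>>> cleaned = df.set_index("id").drop(columns="tmp")  # index by 'id', exclude 'tmp'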
result_index = None
@@ -2215,7 +2433,7 @@ def maybe_reorder(
) -> tuple[list[ArrayLike], Index, Index | None]:
"""
If our desired 'columns' do not match the data's pre-existing 'arr_columns',
- we re-order our arrays. This is like a preemptive (cheap) reindex.
+ we re-order our arrays. This is like a pre-emptive (cheap) reindex.
"""
if len(arrays):
length = len(arrays[0])
@@ -2319,17 +2537,16 @@ def maybe_reorder(
exclude.update(index)
if any(exclude):
- arr_exclude = (x for x in exclude if x in arr_columns)
- to_remove = {arr_columns.get_loc(col) for col in arr_exclude} # pyright: ignore[reportUnhashable]
+ arr_exclude = [x for x in exclude if x in arr_columns]
+ to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
columns = columns.drop(exclude)
- mgr = arrays_to_mgr(arrays, columns, result_index)
- df = DataFrame._from_mgr(mgr, axes=mgr.axes)
- if cls is not DataFrame:
- return cls(df, copy=False)
- return df
+ manager = _get_option("mode.data_manager", silent=True)
+ mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager)
+
+ return cls._from_mgr(mgr, axes=mgr.axes)
def to_records(
self, index: bool = True, column_dtypes=None, index_dtypes=None
@@ -2372,7 +2589,8 @@ def to_records(
Examples
--------
- >>> df = pd.DataFrame({"A": [1, 2], "B": [0.5, 0.75]}, index=["a", "b"])
+ >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
+ ... index=['a', 'b'])
>>> df
A B
a 1 0.50
@@ -2526,6 +2744,7 @@ def _from_arrays(
if dtype is not None:
dtype = pandas_dtype(dtype)
+ manager = _get_option("mode.data_manager", silent=True)
columns = ensure_index(columns)
if len(columns) != len(arrays):
raise ValueError("len(columns) must match len(arrays)")
@@ -2535,6 +2754,7 @@ def _from_arrays(
index,
dtype=dtype,
verify_integrity=verify_integrity,
+ typ=manager,
)
return cls._from_mgr(mgr, axes=mgr.axes)
@@ -2643,10 +2863,10 @@ def to_stata(
Examples
--------
- >>> df = pd.DataFrame(
- ... [["falcon", 350], ["parrot", 18]], columns=["animal", "parrot"]
- ... )
- >>> df.to_stata("animals.dta") # doctest: +SKIP
+ >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon',
+ ... 'parrot'],
+ ... 'speed': [350, 18, 361, 15]}})
+ >>> df.to_stata('animals.dta') # doctest: +SKIP
"""
if version not in (114, 117, 118, 119, None):
raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
@@ -2706,16 +2926,6 @@ def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None:
This includes the `compression`, `compression_level`, `chunksize`
and `version` keywords.
- See Also
- --------
- DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
- DataFrame.to_excel : Write object to an Excel sheet.
- DataFrame.to_sql : Write to a sql table.
- DataFrame.to_csv : Write a csv file.
- DataFrame.to_json : Convert the object to a JSON string.
- DataFrame.to_html : Render a DataFrame as an HTML table.
- DataFrame.to_string : Convert DataFrame to a string.
-
Notes
-----
This function writes the dataframe as a `feather file
@@ -2732,88 +2942,14 @@ def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None:
to_feather(self, path, **kwargs)
- @overload
- def to_markdown(
- self,
- buf: None = ...,
- *,
- mode: str = ...,
- index: bool = ...,
- storage_options: StorageOptions | None = ...,
- **kwargs,
- ) -> str: ...
-
- @overload
- def to_markdown(
- self,
- buf: FilePath | WriteBuffer[str],
- *,
- mode: str = ...,
- index: bool = ...,
- storage_options: StorageOptions | None = ...,
- **kwargs,
- ) -> None: ...
-
- @overload
- def to_markdown(
- self,
- buf: FilePath | WriteBuffer[str] | None,
- *,
- mode: str = ...,
- index: bool = ...,
- storage_options: StorageOptions | None = ...,
- **kwargs,
- ) -> str | None: ...
-
- def to_markdown(
- self,
- buf: FilePath | WriteBuffer[str] | None = None,
- *,
- mode: str = "wt",
- index: bool = True,
- storage_options: StorageOptions | None = None,
- **kwargs,
- ) -> str | None:
- """
- Print DataFrame in Markdown-friendly format.
-
- Parameters
- ----------
- buf : str, Path or StringIO-like, optional, default None
- Buffer to write to. If None, the output is returned as a string.
- mode : str, optional
- Mode in which file is opened, "wt" by default.
- index : bool, optional, default True
- Add index (row) labels.
-
- storage_options : dict, optional
- Extra options that make sense for a particular storage connection, e.g.
- host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
- are forwarded to ``urllib.request.Request`` as header options. For other
- URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
- forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
- details, and for more examples on storage options refer `here
- `_.
-
- **kwargs
- These parameters will be passed to `tabulate `_.
-
- Returns
- -------
- str
- DataFrame in Markdown-friendly format.
-
- See Also
- --------
- DataFrame.to_html : Render DataFrame to HTML-formatted table.
- DataFrame.to_latex : Render DataFrame to LaTeX-formatted table.
-
- Notes
- -----
- Requires the `tabulate `_ package.
-
- Examples
+ @deprecate_nonkeyword_arguments(
+ version="3.0", allowed_args=["self", "buf"], name="to_markdown"
+ )
+ @doc(
+ Series.to_markdown,
+ klass=_shared_doc_kwargs["klass"],
+ storage_options=_shared_docs["storage_options"],
+ examples="""Examples
--------
>>> df = pd.DataFrame(
... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
@@ -2833,8 +2969,16 @@ def to_markdown(
| 0 | elk | dog |
+----+------------+------------+
| 1 | pig | quetzal |
- +----+------------+------------+
- """
+ +----+------------+------------+""",
+ )
+ def to_markdown(
+ self,
+ buf: FilePath | WriteBuffer[str] | None = None,
+ mode: str = "wt",
+ index: bool = True,
+ storage_options: StorageOptions | None = None,
+ **kwargs,
+ ) -> str | None:
if "showindex" in kwargs:
raise ValueError("Pass 'index' instead of 'showindex'")
@@ -2854,33 +2998,35 @@ def to_markdown(
def to_parquet(
self,
path: None = ...,
- *,
engine: Literal["auto", "pyarrow", "fastparquet"] = ...,
compression: str | None = ...,
index: bool | None = ...,
partition_cols: list[str] | None = ...,
storage_options: StorageOptions = ...,
**kwargs,
- ) -> bytes: ...
+ ) -> bytes:
+ ...
@overload
def to_parquet(
self,
path: FilePath | WriteBuffer[bytes],
- *,
engine: Literal["auto", "pyarrow", "fastparquet"] = ...,
compression: str | None = ...,
index: bool | None = ...,
partition_cols: list[str] | None = ...,
storage_options: StorageOptions = ...,
**kwargs,
- ) -> None: ...
+ ) -> None:
+ ...
+ @deprecate_nonkeyword_arguments(
+ version="3.0", allowed_args=["self", "path"], name="to_parquet"
+ )
@doc(storage_options=_shared_docs["storage_options"])
def to_parquet(
self,
path: FilePath | WriteBuffer[bytes] | None = None,
- *,
engine: Literal["auto", "pyarrow", "fastparquet"] = "auto",
compression: str | None = "snappy",
index: bool | None = None,
@@ -2932,9 +3078,6 @@ def to_parquet(
Returns
-------
bytes if no path argument is provided else None
- Returns the DataFrame converted to the binary parquet format as bytes if no
- path argument. Returns None and writes the DataFrame to the specified
- location in the Parquet format if the path argument is provided.
See Also
--------
@@ -2946,22 +3089,16 @@ def to_parquet(
Notes
-----
- * This function requires either the `fastparquet
- `_ or `pyarrow
- `_ library.
- * When saving a DataFrame with categorical columns to parquet,
- the file size may increase due to the inclusion of all possible
- categories, not just those present in the data. This behavior
- is expected and consistent with pandas' handling of categorical data.
- To manage file size and ensure a more predictable roundtrip process,
- consider using :meth:`Categorical.remove_unused_categories` on the
- DataFrame before saving.
+ This function requires either the `fastparquet
+ `_ or `pyarrow
+ `_ library.
Examples
--------
- >>> df = pd.DataFrame(data={{"col1": [1, 2], "col2": [3, 4]}})
- >>> df.to_parquet("df.parquet.gzip", compression="gzip") # doctest: +SKIP
- >>> pd.read_parquet("df.parquet.gzip") # doctest: +SKIP
+ >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}})
+ >>> df.to_parquet('df.parquet.gzip',
+ ... compression='gzip') # doctest: +SKIP
+ >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
col1 col2
0 1 3
1 2 4
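A hedged sketch of the in-memory round trip implied by the return value described above (``io.BytesIO`` is just one way to read the bytes back):

>>> import io
>>> raw = df.to_parquet()  # returns bytes because no path is given  # doctest: +SKIP
>>> pd.read_parquet(io.BytesIO(raw))  # doctest: +SKIP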
@@ -2989,36 +3126,6 @@ def to_parquet(
**kwargs,
)
- @overload
- def to_orc(
- self,
- path: None = ...,
- *,
- engine: Literal["pyarrow"] = ...,
- index: bool | None = ...,
- engine_kwargs: dict[str, Any] | None = ...,
- ) -> bytes: ...
-
- @overload
- def to_orc(
- self,
- path: FilePath | WriteBuffer[bytes],
- *,
- engine: Literal["pyarrow"] = ...,
- index: bool | None = ...,
- engine_kwargs: dict[str, Any] | None = ...,
- ) -> None: ...
-
- @overload
- def to_orc(
- self,
- path: FilePath | WriteBuffer[bytes] | None,
- *,
- engine: Literal["pyarrow"] = ...,
- index: bool | None = ...,
- engine_kwargs: dict[str, Any] | None = ...,
- ) -> bytes | None: ...
-
def to_orc(
self,
path: FilePath | WriteBuffer[bytes] | None = None,
@@ -3028,7 +3135,7 @@ def to_orc(
engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
"""
- Write a DataFrame to the Optimized Row Columnar (ORC) format.
+ Write a DataFrame to the ORC format.
.. versionadded:: 1.5.0
@@ -3055,8 +3162,7 @@ def to_orc(
Returns
-------
- bytes if no ``path`` argument is provided else None
- Bytes object with DataFrame data if ``path`` is not specified else None.
+ bytes if no path argument is provided else None
Raises
------
@@ -3076,8 +3182,6 @@ def to_orc(
Notes
-----
- * Find more information on ORC
- `here `__.
* Before using this function you should read the :ref:`user guide about
ORC ` and :ref:`install optional dependencies `.
* This function requires `pyarrow `_
@@ -3089,9 +3193,9 @@ def to_orc(
Examples
--------
- >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]})
- >>> df.to_orc("df.orc") # doctest: +SKIP
- >>> pd.read_orc("df.orc") # doctest: +SKIP
+ >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
+ >>> df.to_orc('df.orc') # doctest: +SKIP
+ >>> pd.read_orc('df.orc') # doctest: +SKIP
col1 col2
0 1 4
1 2 3
@@ -3114,7 +3218,6 @@ def to_orc(
def to_html(
self,
buf: FilePath | WriteBuffer[str],
- *,
columns: Axes | None = ...,
col_space: ColspaceArgType | None = ...,
header: bool = ...,
@@ -3137,13 +3240,13 @@ def to_html(
table_id: str | None = ...,
render_links: bool = ...,
encoding: str | None = ...,
- ) -> None: ...
+ ) -> None:
+ ...
@overload
def to_html(
self,
buf: None = ...,
- *,
columns: Axes | None = ...,
col_space: ColspaceArgType | None = ...,
header: bool = ...,
@@ -3166,8 +3269,12 @@ def to_html(
table_id: str | None = ...,
render_links: bool = ...,
encoding: str | None = ...,
- ) -> str: ...
+ ) -> str:
+ ...
+ @deprecate_nonkeyword_arguments(
+ version="3.0", allowed_args=["self", "buf"], name="to_html"
+ )
@Substitution(
header_type="bool",
header="Whether to print column labels, default True",
@@ -3179,7 +3286,6 @@ def to_html(
def to_html(
self,
buf: FilePath | WriteBuffer[str] | None = None,
- *,
columns: Axes | None = None,
col_space: ColspaceArgType | None = None,
header: bool = True,
@@ -3214,13 +3320,9 @@ def to_html(
Convert the characters <, >, and & to HTML-safe sequences.
notebook : {True, False}, default False
Whether the generated HTML is for IPython Notebook.
- border : int or bool
- When an integer value is provided, it sets the border attribute in
- the opening tag, specifying the thickness of the border.
- If ``False`` or ``0`` is passed, the border attribute will not
- be present in the ``<table>`` tag.
- The default value for this parameter is governed by
- ``pd.options.display.html.border``.
+ border : int
+ A ``border=border`` attribute is included in the opening
+ `<table>` tag. Default ``pd.options.display.html.border``.
table_id : str, optional
A css id is included in the opening `