From 4bfbd3f353a197b3efb03472d49f9177b0ef0d58 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 8 Aug 2016 11:51:03 -0700 Subject: [PATCH 1/8] Add sphinx subproject for pandas2 designs --- doc/pandas-2.0/.gitignore | 2 + doc/pandas-2.0/Makefile | 233 ++++++++++++++++++++++++++ doc/pandas-2.0/index.rst | 22 +++ doc/pandas-2.0/make.bat | 263 +++++++++++++++++++++++++++++ doc/pandas-2.0/source/conf.py | 288 ++++++++++++++++++++++++++++++++ doc/pandas-2.0/source/goals.rst | 126 ++++++++++++++ doc/pandas-2.0/source/index.rst | 23 +++ 7 files changed, 957 insertions(+) create mode 100644 doc/pandas-2.0/.gitignore create mode 100644 doc/pandas-2.0/Makefile create mode 100644 doc/pandas-2.0/index.rst create mode 100644 doc/pandas-2.0/make.bat create mode 100644 doc/pandas-2.0/source/conf.py create mode 100644 doc/pandas-2.0/source/goals.rst create mode 100644 doc/pandas-2.0/source/index.rst diff --git a/doc/pandas-2.0/.gitignore b/doc/pandas-2.0/.gitignore new file mode 100644 index 0000000000000..30f5f674735cb --- /dev/null +++ b/doc/pandas-2.0/.gitignore @@ -0,0 +1,2 @@ +pandas2.github.io +_build diff --git a/doc/pandas-2.0/Makefile b/doc/pandas-2.0/Makefile new file mode 100644 index 0000000000000..f6476bc1102b4 --- /dev/null +++ b/doc/pandas-2.0/Makefile @@ -0,0 +1,233 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. 
+PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source + +.PHONY: help +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + +.PHONY: clean +clean: + rm -rf $(BUILDDIR)/* + +.PHONY: html +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +.PHONY: dirhtml +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +.PHONY: singlehtml +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +.PHONY: pickle +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +.PHONY: json +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +.PHONY: htmlhelp +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +.PHONY: qthelp +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pandas20DesignDocs.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pandas20DesignDocs.qhc" + +.PHONY: applehelp +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. 
You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +.PHONY: devhelp +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/pandas20DesignDocs" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pandas20DesignDocs" + @echo "# devhelp" + +.PHONY: epub +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +.PHONY: latex +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +.PHONY: latexpdf +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: latexpdfja +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: text +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +.PHONY: man +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +.PHONY: texinfo +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +.PHONY: info +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +.PHONY: gettext +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +.PHONY: changes +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +.PHONY: linkcheck +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +.PHONY: doctest +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +.PHONY: coverage +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +.PHONY: xml +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 
+ +.PHONY: pseudoxml +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." + +OUTPUTDIR=_build/html +DEPLOYREPOSITORY=pandas2.github.io + +deploy: html + if test -d $(OUTPUTDIR); \ + then echo " (build directory exists)"; \ + else mkdir -p $(OUTPUTDIR); \ + fi + if test -d $(DEPLOYREPOSITORY); \ + then echo " (repository directory exists)"; \ + else git clone git@github.com:pandas2/$(DEPLOYREPOSITORY).git; \ + fi + cd $(DEPLOYREPOSITORY) && git pull + rsync -r $(OUTPUTDIR)/* $(DEPLOYREPOSITORY)/ + cd $(DEPLOYREPOSITORY) && git add . && git commit -m "deploy" + cd $(DEPLOYREPOSITORY) && git push origin master diff --git a/doc/pandas-2.0/index.rst b/doc/pandas-2.0/index.rst new file mode 100644 index 0000000000000..6775e45c84b1f --- /dev/null +++ b/doc/pandas-2.0/index.rst @@ -0,0 +1,22 @@ +.. pandas 2.0 Design Docs documentation master file, created by + sphinx-quickstart on Mon Aug 8 11:48:39 2016. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to pandas 2.0 Design Docs's documentation! +================================================== + +Contents: + +.. toctree:: + :maxdepth: 2 + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + diff --git a/doc/pandas-2.0/make.bat b/doc/pandas-2.0/make.bat new file mode 100644 index 0000000000000..4c22f56b72e9f --- /dev/null +++ b/doc/pandas-2.0/make.bat @@ -0,0 +1,263 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + echo. coverage to run coverage check of the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +REM Check if sphinx-build is available and fallback to Python version if any +%SPHINXBUILD% 1>NUL 2>NUL +if errorlevel 9009 goto sphinx_python +goto sphinx_ok + +:sphinx_python + +set SPHINXBUILD=python -m sphinx.__init__ +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. 
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +:sphinx_ok + + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pandas20DesignDocs.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pandas20DesignDocs.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. 
+ goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "coverage" ( + %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage + if errorlevel 1 exit /b 1 + echo. + echo.Testing of coverage in the sources finished, look at the ^ +results in %BUILDDIR%/coverage/python.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +:end diff --git a/doc/pandas-2.0/source/conf.py b/doc/pandas-2.0/source/conf.py new file mode 100644 index 0000000000000..56217e1eda54b --- /dev/null +++ b/doc/pandas-2.0/source/conf.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# pandas 2.0 Design Docs documentation build configuration file, created by +# sphinx-quickstart on Mon Aug 8 11:48:39 2016. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. 
+project = 'pandas 2.0 Design Docs' +copyright = '2016, pandas Core Team' +author = 'pandas Core Team' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.1' +# The full version, including alpha/beta/rc tags. +release = '0.1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +import sphinx_rtd_theme + +html_theme = "sphinx_rtd_theme" + +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. 
+#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pandas20DesignDocsdoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'pandas20DesignDocs.tex', 'pandas 2.0 Design Docs Documentation', + 'pandas Core Team', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). 
+man_pages = [ + (master_doc, 'pandas20designdocs', 'pandas 2.0 Design Docs Documentation', + [author], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'pandas20DesignDocs', 'pandas 2.0 Design Docs Documentation', + author, 'pandas20DesignDocs', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False diff --git a/doc/pandas-2.0/source/goals.rst b/doc/pandas-2.0/source/goals.rst new file mode 100644 index 0000000000000..0b11375947ef9 --- /dev/null +++ b/doc/pandas-2.0/source/goals.rst @@ -0,0 +1,126 @@ +.. _goals: + +======================= + Goals and Motivations +======================= + +.. note:: + + These documents are largely written by Wes McKinney, and at this point + reflect his opinions for the time being + +The pandas codebase is now over 8 years old, having grown to over 200,000 lines +of code from its original ~10,000 LOC in the original 0.1 open source release +in January 2010. + +At a high level, the "pandas 2.0" effort is based on a couple of observations: + +* The pandas 0.x series of releases have been primarily iterative improvements + to the library, with new features, bug fixes, and improved + documentation. There have also been a series of deprecations, API changes, + and other evolutions of pandas's API to account for suboptimal design choices + (for example: the ``.ix`` operator) made in the early days of the project + (2010 to 2012). +* pandas's ability to support an increasingly broad set of use cases has been + significantly constrained (as will be examined in detail in these documents) + by its tight coupling to NumPy and therefore subject to design limitations in + NumPy. +* Making significant functional additions (particularly filling gaps in NumPy) + to pandas, particularly new data types, has grown increasingly complex with + very obvious accumulations of technical debt. +* pandas is being used increasingly for very large datasets on machines with + many cores and large amounts of RAM (100s of gigabytes to terabytes) +* Rough edges in pandas's implementation (e.g. its handling of missing data + across data types) are being exposed to users. + +These documents are largely concerned with pandas's internal design, which is +mostly invisible to average users. Advanced users of pandas are generally +familiar with some of these internal details, particular around performance and +memory use, and so the degree to which users are impacted will vary quite a +lot. 
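+
+As a concrete example of the "rough edges" mentioned above (a sketch of
+current 0.x behavior; exact output may vary by version), introducing a
+missing value into an integer Series silently changes its dtype:
+
+.. code-block:: python
+
+   import numpy as np
+   import pandas as pd
+
+   s = pd.Series([1, 2, 3])
+   s.dtype        # int64
+   s[1] = np.nan  # there is no integer NA value, so...
+   s.dtype        # now float64: the data was silently upcast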
+ +Key areas of work +================= + +Possible changes or improvements to pandas's internals fall into a number of +different buckets to be explored in great detail: + +* **Decoupling from NumPy while preserving interoperability**: by eliminating + the presumption that pandas objects internally must contain data stored in + NumPy ``ndarray`` objects, we will be able to bring more consistency to + pandas's semantics and enable the core developers to extend pandas more + cleanly with new data types, data structures, and computational semantics. +* **Exposing a pandas Cython and/or C/C++ API to other Python library + developers**: the internals of Series and DataFrame are only weakly + accessible in other developers' native code. At minimum, we wish to better + enable developers to construct the precise data structures / memory + representation that fill the insides of Series and DataFrame. +* **Improving user control and visibility of memory use**: pandas's memory use, + as a result of its internal implementation, can frequently be opaque to the + user or outright unpredictable. +* **Improving performance and system utilization**: We aim to improve both the + micro (operations that take < 1 ms) and macro (all other operations) + performance of pandas across the board. As part of this, we aim to make it + easier for pandas's core developers to leverage multicore systems to + accelerate computations (without running into any of Python's well-known + concurrency limitations) +* **Removal of deprecated / underutilized functionality**: As the Python data + ecosystem has grown, a number of areas of pandas (e.g. plotting and datasets + with more than 2 dimensions) may be better served by other open source + projects. Also, functionality that has been explicitly deprecated or + discouraged from use (like the ``.ix`` indexing operator) would ideally be + removed. + +Non-goals / FAQ +=============== + +As this will be a quite nuanced discussion, especially for those not intimately +familiar with pandas's implementation details, I wanted to speak to a couple of +commonly-asked questions in brief: + +1. **Will this work make it harder to use pandas with NumPy, scikit-learn, + statsmodels, SciPy, or other libraries that depend on NumPy + interoperability?** + * We are not planning on it. Data that is representable without memory + copying or conversion in NumPy arrays will continue to be 100% + interoperable. + * Data containing missing (NA) values may require explicit conversion where + it is not currently required. For example: integer or boolean type arrays + with missing data. I trust this will be seen as a positive development. + * If anything, more performant and more precise data semantics in pandas will + generally make production code using a downstream library like scikit-learn + more dependable and future-proof. + +2. **By decoupling from NumPy, it sounds like you are reimplementing NumPy or + adding a new data type system** + + * Simply put: no. But it's more complicated than that because of the + numerous interpretations of "type system". + + * pandas already contains a large amount (10s of KLOCs) of custom + computational code (see, for example, + `https://github.com/pydata/pandas/tree/master/pandas/src`) that implements + functionality not present in NumPy. 
+ + * pandas already features its own (what can be called) "logical type + system", including things like custom data types (such as that of + ``pandas.Categorical``), pandas-specific missing data representation, and + implicit type casting (e.g. integer to float on introduction of missing + data). Unfortunately, these logical data types are somewhat weakly + expressed, and the mix of NumPy dtype objects and custom pandas types is + problematic for many internal (implementation) and external (user API) + reasons. I will examine in detail the difference between **physical + types** (i.e. NumPy's dtypes) and **logical types** (i.e. what pandas + currently has, implicitly). + +Summary +======= + +Overall, the goal of the pandas 2.0 project is to yield a faster, more cleanly +architected, and more future-proof library that is a drop-in replacement for +90-95% of pandas user code. There will be API / code breakages, but the intent +of any code breakage will almost always be to fix something that has been +"wrong" or inconsistent. Many advanced users will have worked around some of +these rough edges, and so their workarounds may either need to be removed or +changed to accommodate the new (and hopefully it can be agreed in each case: +better) semantics. diff --git a/doc/pandas-2.0/source/index.rst b/doc/pandas-2.0/source/index.rst new file mode 100644 index 0000000000000..8d64661114a37 --- /dev/null +++ b/doc/pandas-2.0/source/index.rst @@ -0,0 +1,23 @@ +pandas 2.0 Design Documents +=========================== + +These are a set of documents, based on discussions started in December 2015, to +assist with discussions around changes to Python pandas's internal design +forward to better accommodate the evolving needs of the growing Python data +userbase and to help keep pandas a relevant and important project in the +future. + +.. toctree:: + :maxdepth: 3 + + goals + internal-architecture + copyonwrite + strings + +.. Indices and tables +.. ================== + +.. * :ref:`genindex` +.. * :ref:`modindex` +.. * :ref:`search` From 26bb013e690dc42ff6faa5bc4881483b86530261 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 8 Aug 2016 12:20:28 -0700 Subject: [PATCH 2/8] More faq --- doc/pandas-2.0/source/goals.rst | 81 +++++++++++++++++++++++++++++---- 1 file changed, 71 insertions(+), 10 deletions(-) diff --git a/doc/pandas-2.0/source/goals.rst b/doc/pandas-2.0/source/goals.rst index 0b11375947ef9..a820d83809282 100644 --- a/doc/pandas-2.0/source/goals.rst +++ b/doc/pandas-2.0/source/goals.rst @@ -13,18 +13,23 @@ The pandas codebase is now over 8 years old, having grown to over 200,000 lines of code from its original ~10,000 LOC in the original 0.1 open source release in January 2010. -At a high level, the "pandas 2.0" effort is based on a couple of observations: - -* The pandas 0.x series of releases have been primarily iterative improvements - to the library, with new features, bug fixes, and improved - documentation. There have also been a series of deprecations, API changes, - and other evolutions of pandas's API to account for suboptimal design choices - (for example: the ``.ix`` operator) made in the early days of the project - (2010 to 2012). +At a high level, the "pandas 2.0" effort is based on a number of observations: + +* The pandas 0.x series of releases have consisted with huge amounts of + iterative improvements to the library along with some major new features, bug + fixes, and improved documentation. 
There have also been a series of + deprecations, API changes, and other evolutions of pandas's API to account + for suboptimal design choices (for example: the ``.ix`` operator) made in the + early days of the project (2010 to 2012). +* The unification of Series and DataFrame internals to be based on a common + ``NDFrame`` base class and "block manager" data structure (heroically + championed by Jeff Reback), while introducing many benefits to pandas, has + come to be viewed as a long-term source of technical debt and code + complexity. * pandas's ability to support an increasingly broad set of use cases has been significantly constrained (as will be examined in detail in these documents) - by its tight coupling to NumPy and therefore subject to design limitations in - NumPy. + by its tight coupling to NumPy and therefore subject to various limitations + in NumPy. * Making significant functional additions (particularly filling gaps in NumPy) to pandas, particularly new data types, has grown increasingly complex with very obvious accumulations of technical debt. @@ -78,6 +83,8 @@ As this will be a quite nuanced discussion, especially for those not intimately familiar with pandas's implementation details, I wanted to speak to a couple of commonly-asked questions in brief: +```` + 1. **Will this work make it harder to use pandas with NumPy, scikit-learn, statsmodels, SciPy, or other libraries that depend on NumPy interoperability?** @@ -91,6 +98,8 @@ commonly-asked questions in brief: generally make production code using a downstream library like scikit-learn more dependable and future-proof. +```` + 2. **By decoupling from NumPy, it sounds like you are reimplementing NumPy or adding a new data type system** @@ -113,6 +122,58 @@ commonly-asked questions in brief: types** (i.e. NumPy's dtypes) and **logical types** (i.e. what pandas currently has, implicitly). +```` + +3. **Shouldn't you try to accomplish your goals by contributing work to NumPy + instead of investing major work in pandas's internals?** + + * In my opinion, this is a "false dichotomy"; i.e. these things are not + mutually exclusive. + + * Yes, we should define, scope, and if possible help implement improvements + to NumPy that make sense. As NumPy serves a significantly larger and more + diverse set of users, major changes to the NumPy C codebase must be + approached more conservatively. + + * It is unclear that pandas's body of domain-specific data handling and + computational code is entirely "in scope" for NumPy. Some technical + details, such as our categorical or datetime data semantics, "group by" + functionality, relational algebra (joins), etc., may be ideal for pandas + but not necessarily ideal for a general user of NumPy. My opinion is that + functionality from NumPy we wish to use in pandas should "pass through" to + the user unmodified, but we must retain the flexibility to work "outside + the box" (implement things not found in NumPy) without adding technical + debt or user API complexity. + +```` + +4. **API changes / breaks are thought to be bad; don't you have a + responsibility to maintain backwards compatibility for users that heavily + depend on pandas?** + + * It's true that APIs should not be broken or changed, and as such should be + approached with extreme caution. + + * The goal of the pandas 2.0 initiative is to only make "good" API breaks + that yield a net benefit that can be easily demonstrated. 
As an example: + adding native missing data support to integer and boolean data (without + casting to another physical storage type) may break user code that has + knowledge of the "rough edge" (the behavior that we are fixing). As these + changes will mostly affect advanced pandas users, I expect they will be + welcomed. + + * Any major API change or break will be documented and justified to assist + with code migration. + + * As soon as we are able, we will post binary development artifacts for the + pandas 2.0 development branch to get early feedback from heavy pandas + users to understand the impact of changes and how we can better help the + existing user base. + + * Some users will find that a certain piece of code has been working "by + accident" (i.e. relying upon undocumented behavior). This kind of breakage + is already a routine occurrence unfortunately. + Summary ======= From ec953d2975d13df57a0c9b2680a8e2e912c2b7ab Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 8 Aug 2016 13:14:26 -0700 Subject: [PATCH 3/8] Some exposition on missing data --- doc/pandas-2.0/source/conf.py | 12 +- doc/pandas-2.0/source/goals.rst | 3 +- doc/pandas-2.0/source/index.rst | 4 +- .../source/internal-architecture.rst | 139 ++++++++++++++++++ 4 files changed, 150 insertions(+), 8 deletions(-) create mode 100644 doc/pandas-2.0/source/internal-architecture.rst diff --git a/doc/pandas-2.0/source/conf.py b/doc/pandas-2.0/source/conf.py index 56217e1eda54b..cf536bfa7d9b6 100644 --- a/doc/pandas-2.0/source/conf.py +++ b/doc/pandas-2.0/source/conf.py @@ -24,12 +24,14 @@ # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = [] + +extensions = ['IPython.sphinxext.ipython_directive', + 'IPython.sphinxext.ipython_console_highlighting'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -46,9 +48,9 @@ master_doc = 'index' # General information about the project. -project = 'pandas 2.0 Design Docs' -copyright = '2016, pandas Core Team' -author = 'pandas Core Team' +project = "Wes's pandas 2.0 Design Docs" +copyright = '2016, Wes McKinney' +author = 'Wes McKinney' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/doc/pandas-2.0/source/goals.rst b/doc/pandas-2.0/source/goals.rst index a820d83809282..5d4c592437f1a 100644 --- a/doc/pandas-2.0/source/goals.rst +++ b/doc/pandas-2.0/source/goals.rst @@ -7,7 +7,8 @@ .. note:: These documents are largely written by Wes McKinney, and at this point - reflect his opinions for the time being + reflect his opinions for the time being. Many things may change as we discuss + and work to reach a consensus about the path forward. 
The pandas codebase is now over 8 years old, having grown to over 200,000 lines of code from its original ~10,000 LOC in the original 0.1 open source release diff --git a/doc/pandas-2.0/source/index.rst b/doc/pandas-2.0/source/index.rst index 8d64661114a37..de8b825945322 100644 --- a/doc/pandas-2.0/source/index.rst +++ b/doc/pandas-2.0/source/index.rst @@ -1,5 +1,5 @@ -pandas 2.0 Design Documents -=========================== +Wes's pandas 2.0 Design Documents +================================= These are a set of documents, based on discussions started in December 2015, to assist with discussions around changes to Python pandas's internal design diff --git a/doc/pandas-2.0/source/internal-architecture.rst b/doc/pandas-2.0/source/internal-architecture.rst new file mode 100644 index 0000000000000..1ed812cd8e943 --- /dev/null +++ b/doc/pandas-2.0/source/internal-architecture.rst @@ -0,0 +1,139 @@ +.. _internal-architecture: + +.. ipython:: python + :suppress: + + import numpy as np + import pandas as pd + np.set_printoptions(precision=4, suppress=True) + pd.options.display.max_rows = 100 + +=============================== + Internal Architecture Changes +=============================== + +Logical types and Physical Storage Decoupling +============================================= + +Removal of BlockManager / new DataFrame internals +================================================= + +``pandas.Array`` and ``pandas.Table`` +===================================== + +Missing data consistency +======================== + +Once the physical memory representation has been effectively decoupled from the +user API, we can consider various approaches to implementing missing data in a +consistent way for every logical pandas data type. + +To motivate this, let's look at some integer data: + +.. ipython:: python + + s = pd.Series([1, 2, 3, 4, 5]) + s + s.dtype + s.values + +If we assign a ``numpy.NaN``, see what happens: + +.. ipython:: python + + s[2] = np.NaN + s + s.dtype + s.values + +The story for boolean data is similar: + +.. ipython:: python + + s = pd.Series([True, False, True]) + s.dtype + s[2] = np.NaN + s.dtype + s.values + +This implicit behavior appears in many scenarios, such as: + +* Loading data from any source: databases, CSV files, R data files, etc. +* Joins or reindexing operations introducing missing data +* Pivot / reshape operations +* Time series resampling +* Certain types of GroupBy operations + +A proposed solution +~~~~~~~~~~~~~~~~~~~ + +My proposal for introducing missing data into any NumPy type outside of +floating point (which uses ``NaN`` for now) and Python object (which uses +``None`` or ``NaN`` interchangeably) is to **allocate and manage an internal +bitmap** (which the user never sees). This has numerous benefits: + +* 1 byte of memory overhead for each 8 values +* Bitmaps can propagate their nulls in C through bitwise ``&`` or ``|`` + operations, which are inexpensive. +* Getting and setting bits on modern hardware is very CPU-inexpensive. For + single-pass array operations (like groupbys) on very large arrays this may + also result in better CPU cache utilization (fewer main-memory reads of the + bitmap). +* Hardware and SIMD "popcount" intrinsics (which can operate on 64-128 bits at + a time) can be used to count bits and skip null-handling on segments of data + containing no nulls. + +Notably, this is the way that PostgreSQL handles null values. For example, we +might have: + +.. 
code-block:: text
+
+   [0, 1, 2, NA, NA, 5, 6, NA]
+
+   i: 7 6 5 4 3 2 1 0
+   bitmap: 0 1 1 0 0 1 1 1
+
+Here, the convention of 1 for "not null" (a la PostgreSQL) and
+least-significant bit ordering (LSB "bit endianness") is being used.
+
+Under the new regime, users could simply write:
+
+.. code-block:: python
+
+   s[2] = pandas.NA
+
+and the data type would be unmodified. It may be necessary to write something
+akin to:
+
+.. code-block:: python
+
+   s.to_numpy(dtype=np.float64, na_rep=np.nan)
+
+and that would emulate the current behavior. Attempts to use ``__array__``
+(for example: calling ``np.sqrt`` on the data) would result in an error since
+we will likely want to refuse to make a guess as to what casting behavior the
+user desires.
+
+Tradeoffs
+~~~~~~~~~
+
+One potential downside of the bitmap approach is that missing data implemented
+outside of NumPy's domain will need to be explicitly converted if it is needed
+in another library that only knows about NumPy. I argue that this is better
+than the current silent, lossy casts (e.g. integer data becoming float).
+
+Proper types for strings and some non-numeric data
+==================================================
+
+I believe that frequently-occurring data types, such as UTF8 strings, are
+important enough to deserve a dedicated logical pandas data type. This will
+enable us both to enforce tighter API semantics (i.e. attempts to assign a
+non-string into string data will be a ``TypeError``) and improved performance
+and memory use under the hood. I will devote an entire section to talking about
+strings.
+
+C++11/14 for lowest implementation tier
+=======================================
+
+3rd-party native API (i.e. Cython and C / C++)
+==============================================

From 2684160a4526a80a479f15c45de9e0af484bf15f Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Mon, 8 Aug 2016 13:17:53 -0700
Subject: [PATCH 4/8] Deploy to wesm/pandas2-design for now

---
 doc/pandas-2.0/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/pandas-2.0/Makefile b/doc/pandas-2.0/Makefile
index f6476bc1102b4..0bb0875e75ae5 100644
--- a/doc/pandas-2.0/Makefile
+++ b/doc/pandas-2.0/Makefile
@@ -216,7 +216,7 @@ pseudoxml:
 	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
OUTPUTDIR=_build/html -DEPLOYREPOSITORY=pandas2.github.io +DEPLOYREPOSITORY=pandas2-design deploy: html if test -d $(OUTPUTDIR); \ @@ -225,7 +225,7 @@ deploy: html fi if test -d $(DEPLOYREPOSITORY); \ then echo " (repository directory exists)"; \ - else git clone git@github.com:pandas2/$(DEPLOYREPOSITORY).git; \ + else git clone git@github.com:wesm/$(DEPLOYREPOSITORY).git; \ fi cd $(DEPLOYREPOSITORY) && git pull rsync -r $(OUTPUTDIR)/* $(DEPLOYREPOSITORY)/ From 136ade91e03d4b988454b97eb5d726c477cfe7f1 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 8 Aug 2016 15:27:36 -0700 Subject: [PATCH 5/8] Draft some string exposition --- doc/pandas-2.0/source/copyonwrite.rst | 5 + doc/pandas-2.0/source/goals.rst | 20 +- doc/pandas-2.0/source/index.rst | 3 +- .../source/internal-architecture.rst | 17 ++ doc/pandas-2.0/source/removals.rst | 46 +++++ doc/pandas-2.0/source/strings.rst | 194 ++++++++++++++++++ 6 files changed, 277 insertions(+), 8 deletions(-) create mode 100644 doc/pandas-2.0/source/copyonwrite.rst create mode 100644 doc/pandas-2.0/source/removals.rst create mode 100644 doc/pandas-2.0/source/strings.rst diff --git a/doc/pandas-2.0/source/copyonwrite.rst b/doc/pandas-2.0/source/copyonwrite.rst new file mode 100644 index 0000000000000..321614171c873 --- /dev/null +++ b/doc/pandas-2.0/source/copyonwrite.rst @@ -0,0 +1,5 @@ +.. _copyonwrite: + +================================== + View semantics and Copy-On-Write +================================== diff --git a/doc/pandas-2.0/source/goals.rst b/doc/pandas-2.0/source/goals.rst index 5d4c592437f1a..cc2ff6467e271 100644 --- a/doc/pandas-2.0/source/goals.rst +++ b/doc/pandas-2.0/source/goals.rst @@ -23,10 +23,10 @@ At a high level, the "pandas 2.0" effort is based on a number of observations: for suboptimal design choices (for example: the ``.ix`` operator) made in the early days of the project (2010 to 2012). * The unification of Series and DataFrame internals to be based on a common - ``NDFrame`` base class and "block manager" data structure (heroically - championed by Jeff Reback), while introducing many benefits to pandas, has - come to be viewed as a long-term source of technical debt and code - complexity. + ``NDFrame`` base class and "block manager" data structure (originally created + by me in 2011, and heroically driven forward to its modern form by Jeff + Reback), while introducing many benefits to pandas, has come to be viewed as + a long-term source of technical debt and code complexity. * pandas's ability to support an increasingly broad set of use cases has been significantly constrained (as will be examined in detail in these documents) by its tight coupling to NumPy and therefore subject to various limitations @@ -35,7 +35,13 @@ At a high level, the "pandas 2.0" effort is based on a number of observations: to pandas, particularly new data types, has grown increasingly complex with very obvious accumulations of technical debt. * pandas is being used increasingly for very large datasets on machines with - many cores and large amounts of RAM (100s of gigabytes to terabytes) + many cores and large amounts of RAM (100s of gigabytes to terabytes). It + would be nice to be able to better utilize these larger, beefier systems + within a single Python process. +* pandas is being used increasingly as a computational building block of some + larger system, such as Dask or Apache Spark. We should consider reducing the + overhead for making data accessible to pandas (i.e. via memory-mapping or + other low-overhead memory sharing). 
 * Rough edges in pandas's implementation (e.g. its handling of missing data
   across data types) are being exposed to users.
@@ -109,10 +115,10 @@ commonly-asked questions in brief:
 
    * pandas already contains a large amount (10s of KLOCs) of custom
      computational code (see, for example,
-     `https://github.com/pydata/pandas/tree/master/pandas/src`) that implements
+     ``_) that implements
      functionality not present in NumPy.
 
-   * pandas already features its own (what can be called) "logical type
+   * pandas already features its own (what I will describe as a) "logical type
      system", including things like custom data types (such as that of
      ``pandas.Categorical``), pandas-specific missing data representation, and
      implicit type casting (e.g. integer to float on introduction of missing
diff --git a/doc/pandas-2.0/source/index.rst b/doc/pandas-2.0/source/index.rst
index de8b825945322..d1444feb6231e 100644
--- a/doc/pandas-2.0/source/index.rst
+++ b/doc/pandas-2.0/source/index.rst
@@ -12,8 +12,9 @@ future.
 
    goals
    internal-architecture
-   copyonwrite
    strings
+   copyonwrite
+   removals
 
 .. Indices and tables
 .. ==================
diff --git a/doc/pandas-2.0/source/internal-architecture.rst b/doc/pandas-2.0/source/internal-architecture.rst
index 1ed812cd8e943..e7bdf016d94d4 100644
--- a/doc/pandas-2.0/source/internal-architecture.rst
+++ b/doc/pandas-2.0/source/internal-architecture.rst
@@ -132,8 +132,25 @@ non-string into string data will be a ``TypeError``) and improved performance
 and memory use under the hood. I will devote an entire section to talking about
 strings.
 
+In general, I would be supportive of making Python object (``numpy.object_``
+dtype) arrays the solution only for mixed-type arrays and data types for which
+pandas has no native handling.
+
+Permitting "other" (non-NumPy) data structures
+==============================================
+
+
+
 C++11/14 for lowest implementation tier
 =======================================
 
+Currently, pandas architecturally is structured as follows:
+
+* Pure Python implementation of internal data structure business logic
+* Algorithms in Cython (more often) or C (less often) to accelerate
+  computationally-intensive algorithms
+
+
+
 3rd-party native API (i.e. Cython and C / C++)
 ==============================================
diff --git a/doc/pandas-2.0/source/removals.rst b/doc/pandas-2.0/source/removals.rst
new file mode 100644
index 0000000000000..86c41b2606d36
--- /dev/null
+++ b/doc/pandas-2.0/source/removals.rst
@@ -0,0 +1,46 @@
+.. _removals:
+
+================================
+ Code to remove and other ideas
+================================
+
+Dropping Python 2 support
+=========================
+
+Controversial ideas
+===================
+
+Strided arrays: more trouble than they are worth?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Per the general discussion around changing DataFrame's internals to contain a
+list / ``std::vector`` of arrays, for me this raises the question of the
+benefits of continuing to accommodate strided one-dimensional data.
+
+Some pros for eliminating strided data completely:
+
+* Guaranteeing contiguous memory internally will yield more consistent and
+  predictable performance.
+
+* Not needing to consider a stride different from 1 means simpler low-level
+  array indexing code (e.g. you can work with plain C arrays). The stride is a
+  complexity / overhead that leaks to every algorithm that iterates over an
+  array.
+
+* You avoid strange situations where a strided view holds onto a base ndarray
+  reference to a much larger array
+
+* **Example:** ``_. Here, the
+  internal orientation (column-major vs. row-major) is not clear to the user.
+
+Some cons:
+
+* It would not be possible to perform zero-copy computations on a strided
+  NumPy array
+
+* Relatedly, initializing a Series or DataFrame from strided memory would
+  require allocating an equivalent amount of contiguous memory for each of the
+  columns.
+
+For me, at least, I don't find the cons compelling enough to warrant the code
+complexity tradeoff.
diff --git a/doc/pandas-2.0/source/strings.rst b/doc/pandas-2.0/source/strings.rst
new file mode 100644
index 0000000000000..110fe644f09e0
--- /dev/null
+++ b/doc/pandas-2.0/source/strings.rst
@@ -0,0 +1,194 @@
+.. _strings:
+
+.. ipython:: python
+   :suppress:
+
+   import numpy as np
+   import pandas as pd
+   np.set_printoptions(precision=4, suppress=True)
+   pd.options.display.max_rows = 100
+
+==================================
+ Enhanced string / UTF-8 handling
+==================================
+
+There are some things we can do to make pandas use less memory and perform
+computations significantly faster on string data.
+
+Current string problems
+=======================
+
+pandas offers support for columns containing strings (ASCII or Unicode) on a
+somewhat ad hoc basis.
+
+* Strings are stored in NumPy arrays of ``PyObject*`` / ``numpy.object_``
+  dtype. This has several problems:
+
+  * Computations (e.g. ``groupby`` operations) typically utilize a code path
+    for generic Python objects. For example, comparisons or hashing go through
+    the ``PyObject_*`` C API functions. In addition to harming multithreading
+    due to GIL contention (you must acquire the GIL to use these functions),
+    these can also be significantly slower than algorithms that operate on
+    ``const char*``, potentially taking advantage of hardware optimizations.
+
+  * String arrays often feature many copies of or references to the same
+    PyString. Thus, some algorithms may perform redundant computation. Some
+    parts of pandas, like ``pandas.read_csv``, make an effort to deduplicate
+    strings to free memory and accelerate computations (e.g. if you do ``x ==
+    y``, and ``x`` and ``y`` are references to the same ``PyObject*``, Python
+    skips comparing their internal data).
+
+    * Note that this is somewhat mitigated by using ``pandas.Categorical``, but
+      this is not the default storage mechanism. More on this below.
+
+  * Using ``PyString`` objects and ``PyObject*`` NumPy storage adds non-trivial
+    overhead (approximately 24 bytes per unique object, see `this exposition
+    `_ for a deeper dive) to
+    each value.
+
+Possible solution: new non-NumPy string memory layout
+=====================================================
+
+My proposed solution to the string conundrum is the following:
+
+* Create a custom string array container type suitable for use in a
+  ``pandas.Array``, and a ``pandas.string`` logical data type.
+* Require that all strings be encoded as UTF-8.
+* By default, represent all string arrays internally as dictionary-encoded,
+  a.k.a. categorical. Thus, we will typically only ever have 1 copy of any
+  given string in an array.
+* Store the actual string data in a packed UTF-8 buffer. I have seen this in a
+  number of places, but notably it's the way that `Apache Arrow implements
+  variable-length collections
+  `_.
+
+Here is one possible C struct-like layout of this container:
+
+.. 
+.. code-block:: c++
+
+   typedef struct {
+     /* Category / dictionary indices into the string data */
+     uint32_t* indices;
+
+     /* Offsets into the packed UTF-8 data; the length of encoded string i is
+        offsets[i + 1] - offsets[i] */
+     uint32_t* offsets;
+
+     /* The packed UTF-8 data */
+     const char* data;
+
+     /* For nullness */
+     uint8_t* bitmap;
+   } string_array_t;
+
+Here's an example of what the data would look like:
+
+.. code-block:: text
+
+   actual data: ['foo', 'bars', 'foo', null, 'bars']
+
+   indices: [0, 1, 0, 0, 1]
+
+   bitmap (byte 0, read right-to-left): 0 0 0 1 0 1 1 1
+
+   offsets: [0, 3, 7]
+   data: ['f', 'o', 'o', 'b', 'a', 'r', 's']
+
+Some benefits of this approach include:
+
+* Much better data locality for low-cardinality categorical data
+* 8.125 bytes (8 bytes plus 1 bit) of memory overhead per value versus 24 bytes
+  (the current overhead)
+* The data is already categorical: a cast to ``category`` dtype can be
+  performed very cheaply and without duplicating the underlying string memory
+  buffer
+* Computations like ``groupby`` on dictionary-encoded strings will be as
+  performant as those on Categorical currently are.
+
+Some drawbacks
+
+* Mutating slots in a string array becomes more complex. Whether single value
+  assignments or put / array-assignment may likely require creating a new
+  ``data`` buffer. Without a compaction / "garbage collection" step on this
+  buffer it will be possible to have "dead" memory inside it (for example, if
+  you did ``arr[:] = 'a-new-string-value'``, all the existing values would be
+  orphaned).
+
+  * Some systems have addressed this issue by storing all string data in a
+    "global string hash table". This is something we could explore, but it
+    would add quite a bit of complexity to implement and may not be worthwhile
+    at this time.
+
+* Indexing into this data structure to obtain a single Python object will
+  probably want to call ``PyUnicode_FromStringAndSize`` to construct a string
+  (Python 3, therefore Unicode). This requires a memory allocation, where
+  currently only a ``Py_INCREF``.
+
+* Many of pandas's existing algorithms assuming Python objects would need to be
+  specialized to take advantage of this new memory layout. This is both a pro
+  and a con as it will most likely yield significantly better performance.
+
+Concerns / problems
+===================
+
+Preserving code that assumes PyString objects
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Any alternate UTF-8 string in-memory representation should necessarily be able
+to yield Python string objects using ``PyUnicode_FromStringAndSize``. Thus,
+code like this could continue to work:
+
+.. ipython:: python
+
+   s = pd.Series(["como estás?"])
+   s.map(lambda x: x.upper())
+
+One trade-off is that creating the temporary Python strings is potentially
+costly. This could be mitigated for Python ``str`` methods (optimized
+array-oriented code path under the hood), but for arbitrary functions you would
+have to pay this cost.
+
+Accommodating Non-UTF-8 data
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some pandas users will have code that involves various non-UTF-8 Python string
+types:
+
+* Native unicode: Py_UCS1, Py_UCS2, Py_UCS4
+* Non-UTF-8 PyBytes
+
+.. ipython:: python
+
+   s = pd.Series(["como estás?"])
+   s
+   s.str.encode('latin-1')
+   s.str.encode('latin-1').str.decode('latin-1')
+
+Such data could arise from reading a CSV file in a non-UTF-8 encoding where you
+did not indicate the encoding to ``pandas.read_csv``.
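To make the failure mode concrete, here is a minimal sketch in plain Python
(stdlib only, not a proposed pandas API): bytes in a non-UTF-8 encoding such as
Latin-1 generally cannot be reinterpreted as UTF-8, so they could not be stored
as-is in a UTF-8 string column.

.. code-block:: python

   # Latin-1 bytes are not, in general, valid UTF-8: 'á' encodes to the
   # single byte 0xE1, which UTF-8 treats as the start of a multi-byte
   # sequence, so decoding fails.
   raw = "como estás?".encode('latin-1')

   try:
       raw.decode('utf-8')
   except UnicodeDecodeError as exc:
       print(exc)  # 'utf-8' codec can't decode byte 0xe1 ...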
+
+My proposed solution to this is to provide a ``binary`` logical type having the
+same physical memory layout as UTF-8 strings, with only the metadata being
+different. So you would have the following semantics:
+
+* ``latin1_s = s.encode('latin-1')``: this yields a ``binary`` view and
+  allocates new memory.
+* ``utf8_s = s.encode('utf-8')``: this is a no-op, but yields a ``binary`` view.
+* ``s2 = utf8_s.decode('utf-8')``: this requires using a Unicode codec to
+  validate the data against the indicated codec.
+
+Indexing and slicing
+~~~~~~~~~~~~~~~~~~~~
+
+Storing strings as UTF-8 bytes means that things like this become more
+complicated:
+
+.. ipython:: python
+
+   s = pd.Series(["estás está estáis"])
+   s.str[9]
+   s.str[6:10]
+
+Since UTF-8 is a variable-length encoding, finding the logical character by
+position will need to make use of the Python C API (expensive, requires
+creating new Python objects) or a 3rd party library. We could make use of the
+`ICU C++ Libraries`_ to implement this.

From eda2cff26d1a44f8ea59d34c71603850b5c807c5 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Mon, 8 Aug 2016 17:09:32 -0700
Subject: [PATCH 6/8] Part of drafting logical type section

---
 doc/pandas-2.0/.gitignore | 2 +-
 doc/pandas-2.0/Makefile | 2 +-
 doc/pandas-2.0/source/capi.rst | 5 +
 doc/pandas-2.0/source/index.rst | 7 +-
 .../source/internal-architecture.rst | 234 +++++++++++++++++-
 doc/pandas-2.0/source/removals.rst | 32 ++-
 doc/pandas-2.0/source/strings.rst | 17 +-
 7 files changed, 281 insertions(+), 18 deletions(-)
 create mode 100644 doc/pandas-2.0/source/capi.rst

diff --git a/doc/pandas-2.0/.gitignore b/doc/pandas-2.0/.gitignore
index 30f5f674735cb..aeffbbed984ff 100644
--- a/doc/pandas-2.0/.gitignore
+++ b/doc/pandas-2.0/.gitignore
@@ -1,2 +1,2 @@
-pandas2.github.io
+pandas2-design
 _build

diff --git a/doc/pandas-2.0/Makefile b/doc/pandas-2.0/Makefile
index 0bb0875e75ae5..654dda170fe37 100644
--- a/doc/pandas-2.0/Makefile
+++ b/doc/pandas-2.0/Makefile
@@ -230,4 +230,4 @@ deploy: html
	cd $(DEPLOYREPOSITORY) && git pull
	rsync -r $(OUTPUTDIR)/* $(DEPLOYREPOSITORY)/
	cd $(DEPLOYREPOSITORY) && git add . && git commit -m "deploy"
-	cd $(DEPLOYREPOSITORY) && git push origin master
+	cd $(DEPLOYREPOSITORY) && git push origin gh-pages

diff --git a/doc/pandas-2.0/source/capi.rst b/doc/pandas-2.0/source/capi.rst
new file mode 100644
index 0000000000000..6c16aee8f6ed2
--- /dev/null
+++ b/doc/pandas-2.0/source/capi.rst
@@ -0,0 +1,5 @@
+.. _capi:
+
+=============================================
+ A native code API (Cython / C++) for pandas
+=============================================

diff --git a/doc/pandas-2.0/source/index.rst b/doc/pandas-2.0/source/index.rst
index d1444feb6231e..62612a4e596de 100644
--- a/doc/pandas-2.0/source/index.rst
+++ b/doc/pandas-2.0/source/index.rst
@@ -3,9 +3,9 @@ Wes's pandas 2.0 Design Documents

 These are a set of documents, based on discussions started in December 2015,
 to assist with discussions around changes to Python pandas's internal design
-forward to better accommodate the evolving needs of the growing Python data
-userbase and to help keep pandas a relevant and important project in the
-future.
+intended to better accommodate the evolving needs of the growing Python data
+userbase and to help ensure that pandas remains a relevant and important
+project in the future.

 .. toctree::
    :maxdepth: 3

@@ -14,6 +14,7 @@ future.
   internal-architecture
   strings
   copyonwrite
+   capi
   removals
.. Indices and tables

diff --git a/doc/pandas-2.0/source/internal-architecture.rst b/doc/pandas-2.0/source/internal-architecture.rst
index e7bdf016d94d4..7f51b784db53d 100644
--- a/doc/pandas-2.0/source/internal-architecture.rst
+++ b/doc/pandas-2.0/source/internal-architecture.rst
@@ -15,6 +15,193 @@ Logical types and Physical Storage Decoupling
 =============================================

+Since this is the most important, but perhaps also most controversial, change
+(in my opinion) to pandas, I'm going to go over it in great detail. I think the
+hardest part of coming up with clear language and definitions for concepts so
+that we can communicate effectively. For example the term "data type" is vague
+and may mean different things to different people.
+
+A motivating example
+~~~~~~~~~~~~~~~~~~~~
+
+Before digging too much into the technical details and problems/solutions,
+let's look at some code examples. It is not unusual to find code like this in
+pandas's internals:
+
+.. code-block:: python
+
+    def create_from_value(value, index, dtype):
+        # return a new empty value suitable for the dtype
+
+        if is_datetimetz(dtype):
+            subarr = DatetimeIndex([value] * len(index), dtype=dtype)
+        elif is_categorical_dtype(dtype):
+            subarr = Categorical([value] * len(index))
+        else:
+            if not isinstance(dtype, (np.dtype, type(np.dtype))):
+                dtype = dtype.dtype
+            subarr = np.empty(len(index), dtype=dtype)
+            subarr.fill(value)
+        return subarr
+
+or
+
+.. code-block:: python
+
+    if is_categorical_dtype(dtype):
+        upcast_cls = 'category'
+    elif is_datetimetz(dtype):
+        upcast_cls = 'datetimetz'
+    elif issubclass(dtype.type, np.bool_):
+        upcast_cls = 'bool'
+    elif issubclass(dtype.type, np.object_):
+        upcast_cls = 'object'
+    elif is_datetime64_dtype(dtype):
+        upcast_cls = 'datetime'
+    elif is_timedelta64_dtype(dtype):
+        upcast_cls = 'timedelta'
+    else:
+        upcast_cls = 'float'
+
+I've cherry-picked one of a number of places where this type of datatype-based
+branching happens.
+
+The primary reason for this complexity is that pandas is using both NumPy's
+dtype objects (which describe *physical storage*) as well as its own custom
+data type objects as a proxy for pandas's *semantic logical types*.
+
+Let's step back for a second and come up with clear language to steer the
+discussion.
+
+Some definitions
+~~~~~~~~~~~~~~~~
+
+Here is my attempt at definitions of some of the key terms:
+
+* **Metadata**: data that describes other data (such as its in-memory layout)
+
+* **Semantics**: The meaning / abstract interpretation of something. We often
+  discuss the semantics (meaning) of computer programs (i.e. what they do,
+  fundamentally) without touching upon low level details like machine
+  representation, programming languages, compilers, operating systems, etc.
+
+* **Physical data (or storage) types**: these are metadata objects which
+  provide a description of the precise structure of a piece of data in memory.
+
+  * In NumPy, the ``numpy.dtype`` object (aka ``PyArray_Descr`` in the C API)
+    is metadata describing a single cell / value in an array. Combined with the
+    ``shape`` and ``strides`` attributes of the ``ndarray`` object, you have
+    enough information to perform O(1) random access on any cell in an
+    ``ndarray`` and to assign these values to a C type (or, in the case of
+    structured dtypes, assign to a packed C struct).
+
+  * This may or may not include a physical representation of NULL or missing
+    data (for example: nullable float64 might be a physical type indicating a
+    normal float64 array along with a bitmap of null/not-null indicators).
+
+* **Logical data type**: metadata which describes the semantic content of a
+  single value in an array or other collection of values. Depending on the
+  logical type, it may map 1-to-1 to a physical type or not at all. Here are
+  some examples:
+
+  * The ``double`` or ``float64`` type may be viewed both as a logical type as
+    well as a physical type (a 1-to-1 correspondence).
+
+  * pandas's ``category`` dtype contains its own auxiliary array of category
+    values (for example, the distinct strings collected from a string
+    array). Based on the number of categories, the category ``codes`` (which
+    reference the categories array) are stored in the smallest possible integer
+    physical type (from ``int8`` to ``int64``, depending on which type can
+    accommodate the codes). For example, with 50 categories the codes fit in
+    ``int8`` storage; with 1000 categories, ``int16`` would be used.
+
+  * Another example: timestamps may be physically stored in ``int64``
+    storage, and these values are interpreted in the context of a particular
+    time unit or resolution (e.g. nanoseconds, milliseconds, seconds).
+
+In general, new logical types may be formed either by placing new semantics on
+top of a single physical data type or some composition of physical or logical
+types. For example: you could have a categorical type (a logical construct
+consisting of multiple arrays of data) whose categories are some other logical
+type.
+
+For historical reasons, **pandas never developed a clear semantic separation in
+its user API between logical and physical data types**. Also, the addition of
+new, pandas-only "synthetic" dtypes that are unknown to NumPy (like
+categorical, datetimetz, etc.) has expanded this conflation considerably. If
+you also consider pandas's custom missing / NULL data behavior, the addition of
+ad hoc missing data semantics to a physical NumPy data type created, by the
+definitions above, a logical data type (call it ``object[nullable]`` for an
+object array) without ever explicitly saying so.
+
+You might be thinking, "Good job, Wes. You really messed that up!" I'd be
+inclined to agree with you now in retrospect, but back in 2011 pandas was not
+the super popular project that it is today, and we were truly riding on NumPy's
+coat tails. So the extent to which NumPy concepts and APIs were used explicitly
+in pandas made the library easier to adopt. Now in 2016, this feels
+anachronistic / outdated.
+
+High-level logical type proposal
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As we have been discussing periodically on the pandas-dev mailing list and
+GitHub, I am proposing that we start to unravel our current mess by defining
+pandas-specific metadata objects that model the current semantics / behavior of
+the project. What does this mean, exactly?
+
+* Each NumPy dtype object will map 1-to-1 to an equivalent ``pandas.DataType``
+  object.
+* Existing pandas "extension dtypes" (like ``CategoricalDtype`` and
+  ``DatetimeTZDtype``), which have been designed to mimic ``numpy.dtype``, will
+  become logical type subclasses of ``pandas.DataType`` like every other type
+  in pandas (a rough sketch of what this hierarchy might look like follows).
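To make the shape of this proposal concrete, here is a minimal, hypothetical
sketch in Python. Aside from ``pandas.DataType`` and ``CategoricalDtype``,
which are named above, every class, attribute, and function name here is an
illustrative assumption, not a settled API:

.. code-block:: python

   # Hypothetical sketch of a logical type hierarchy; names are illustrative.
   import numpy as np


   class DataType(object):
       """Base class for all pandas logical types."""


   class Float64(DataType):
       # A logical type with a 1-to-1 correspondence to a NumPy physical type
       numpy_dtype = np.dtype('float64')


   class CategoricalDtype(DataType):
       def __init__(self, categories):
           # The categories may themselves be described by any logical type
           self.categories = categories


   # Sketch of the proposed 1-to-1 mapping from NumPy dtypes to logical types
   _NUMPY_TYPE_MAP = {np.dtype('float64'): Float64()}

   def pandas_type_from_numpy(dtype):
       return _NUMPY_TYPE_MAP[np.dtype(dtype)]

The point of the sketch is only that dispatch would key off one coherent
``DataType`` hierarchy rather than a mix of NumPy dtypes and ad hoc proxies.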
+
+Since pandas is about assisting with data manipulation and analysis, at some
+point you must invoke functions that are specialized to the specific physical
+memory representation of your data. For example, pandas has its own
+implementations of ``ndarray.take`` that are used internally for arrays of
+non-negative integers that may contain NULL / NA values (which are represented
+as -1 -- search the codebase for implementations of ``take_1d``).
+
+The major goals of introducing a logical type abstraction are as follows:
+
+* Simplifying "dynamic dispatch": invoking the right functions or choosing the
+  right code branches based on the data type.
+* Enabling pandas to decouple both its internal semantics and physical storage
+  from NumPy's metadata and APIs. Note that this is already happening with
+  categorical types, since a particular instance ``CategoricalDtype`` may
+  physically be stored in one of 4 NumPy data types.
+
+Physical storage decoupling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By separating pandas data from the presumption of using a particular physical
+``numpy.dtype`` internally, we can:
+
+* Begin to better protect users from NumPy data semantics (which are frequently
+  different from pandas's!) leaking through to the pandas user API. This can
+  enable us to address long-standing inconsistencies or "rough edges" in pandas
+  that have persisted due to our tight semantic coupling to NumPy.
+
+* We can consider adding new data structures to pandas, either custom to pandas
+  or provided by 3rd-party libraries, that add new functionality alongside the
+  existing code (presuming NumPy physical storage). As one concrete example,
+  discussed in more detail below, we can enable missing data in integer pandas
+  data by forming a composite data structure consisting of a NumPy array plus a
+  bitmap marking the null / not-null values.
+
+Note that neither of these points implies that we are trying to use NumPy
+less. We already have large amounts of code that implement algorithms also
+found in NumPy (see ``pandas.unique`` or the implementation of ``Series.sum``),
+but taking into account pandas's missing data representation, etc. Internally,
+we can use NumPy when its computational semantics match those we've chosen for
+pandas, and elsewhere we can invoke pandas-specific code.
+
+A major concern here based on these ideas is **preserving NumPy
+interoperability**, so I'll examine this topic in some detail next.
+
+Preserving NumPy interoperability
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 Removal of BlockManager / new DataFrame internals
 =================================================

@@ -75,9 +262,9 @@ bitmap** (which the user never sees). This has numerous benefits:

 * 1 byte of memory overhead for each 8 values
 * Bitmaps can propagate their nulls in C through bitwise ``&`` or ``|``
   operations, which are inexpensive.
-* Getting and setting bits on modern hardware is very CPU-inexpensive. For
-  single-pass array operations (like groupbys) on very large arrays this may
-  also result in better CPU cache utilization (fewer main-memory reads of the
+* Getting and setting bits on modern hardware is CPU-inexpensive. For
+  single-pass array operations (like groupbys) on large arrays this may also
+  result in better CPU cache utilization (fewer main-memory reads of the
* Hardware and SIMD "popcount" intrinsics (which can operate on 64-128 bits at a time) can be used to count bits and skip null-handling on segments of data @@ -122,6 +309,9 @@ outside of NumPy's domain will need to be explicitly converted if it is needed in another library that only knows about NumPy. I argue that this is better than the current +Memory accounting +================= + Proper types for strings and some non-numeric data ================================================== @@ -150,7 +340,45 @@ Currently, pandas architecturally is structured as follows: * Algorithms in Cython (more often) or C (less often) to accelerate computationally-intensive algorithms +While it's overall made pandas easier to develop and maintain internally +(perhaps increasingly less so over time!), this has had a number of drawbacks +Microperformance +~~~~~~~~~~~~~~~~ + +Microperformance (operations taking 1 microsecond to 1 millisecond) has +suffered considerably as pandas's internals have expanded to accommodate new +use cases. Fairly simple operations, from indexing to summary statistics, may +pass through multiple layers of scaffolding before hitting the lowest tier of +computations. Let's take for example: + +.. ipython:: python + + s = pd.Series(np.random.randn(100)) + s.sum() + +Profiling ``s.sum()`` with ``%prun`` in IPython, I am seeing 116 function +calls (pandas 0.18.1). Let's look at the microperformance: + +.. code-block:: python + + In [14]: timeit s.sum() + 10000 loops, best of 3: 31.7 µs per loop + + In [15]: v = s.values + + In [16]: timeit v.sum() + 1000000 loops, best of 3: 1.07 µs per loop + +While a slightly contrived example, the internal data structures and function +dispatch machinery add 30 microseconds of overhead. That may not be a +compelling number, but such a method called 1 million times has an additional +30 seconds of overhead. When you consider microperformance in the context of +custom ``groupby`` operations, for example, this may not be so unrealistic. 3rd-party native API (i.e. Cython and C / C++) ============================================== + +Developers of 3rd-party projects (myself included) have often expressed a +desire to be able to inspect, construct, or otherwise manipulate pandas objects +(if even in a limited fashion) in compiled code (Cython, C, or C++). diff --git a/doc/pandas-2.0/source/removals.rst b/doc/pandas-2.0/source/removals.rst index 86c41b2606d36..7ef90c5d65a0d 100644 --- a/doc/pandas-2.0/source/removals.rst +++ b/doc/pandas-2.0/source/removals.rst @@ -7,8 +7,36 @@ Dropping Python 2 support ========================= -Controversional ideas -===================== +With Python 2.7 reaching its supported end-of-life in 2020, like some other +Python projects (e.g. IPython / Jupyter) we should seriously contemplate making +pandas 2.0 only support Python 3.5 and higher. In addition to lowering the +development burden at both the C API and pure Python level, we can also finally +look to take advantage of features (things like ``asyncio``, maybe?) only +available in Python 3. + +Deprecated code to remove +========================= + +Other ideas +=========== + +Here's a collection of other miscellaneous ideas that don't necessarily fit +elsewhere in these documents. + +Column statistics +~~~~~~~~~~~~~~~~~ + +In quite a few pandas algorithms, there are characteristics of the data that +are very useful to know, such as: + +* **Monotonicity**: for comparable data (e.g. numbers), is the data sorted / + strictly increasing? 
In time series, this permits sorting steps to be
+  skipped.
+
+* **Null count**: for data not containing any nulls, the null handling path in
+  some algorithms can be skipped entirely.
+
 Strided arrays: more trouble than they are worth?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/doc/pandas-2.0/source/strings.rst b/doc/pandas-2.0/source/strings.rst
index 110fe644f09e0..aa065cee4844f 100644
--- a/doc/pandas-2.0/source/strings.rst
+++ b/doc/pandas-2.0/source/strings.rst
@@ -106,12 +106,13 @@ Some benefits of this approach include:

 Some drawbacks

-* Mutating slots in a string array becomes more complex. Whether single value
-  assignments or put / array-assignment may likely require creating a new
-  ``data`` buffer. Without a compaction / "garbage collection" step on this
-  buffer it will be possible to have "dead" memory inside it (for example, if
-  you did ``arr[:] = 'a-new-string-value'``, all the existing values would be
-  orphaned).
+* This memory layout is best used as an immutable representation. Mutating
+  slots here becomes more complex. Either single value assignment or put /
+  array-assignment will likely require constructing a new ``data`` buffer
+  (either by ``realloc`` or some other copying mechanism). Without a compaction
+  / "garbage collection" step on this buffer it will be possible to have "dead"
+  memory inside it (for example, if you did ``arr[:] = 'a-new-string-value'``,
+  all the existing values would be orphaned).

 * Some systems have addressed this issue by storing all string data in a
   "global string hash table". This is something we could explore, but it

@@ -120,8 +121,8 @@ Some drawbacks

 * Indexing into this data structure to obtain a single Python object will
   probably want to call ``PyUnicode_FromStringAndSize`` to construct a string
-  (Python 3, therefore Unicode). This requires a memory allocation, where
-  currently only a ``Py_INCREF``.
+  (Python 3, therefore Unicode). This requires a memory allocation, whereas it
+  currently only has to do a ``Py_INCREF``.

 * Many of pandas's existing algorithms assuming Python objects would need to be
   specialized to take advantage of this new memory layout. This is both a pro

From c742d5d17487be59731bc5ea175ca3eda721da3b Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Mon, 8 Aug 2016 17:51:37 -0700
Subject: [PATCH 7/8] Section on numpy interoperability

---
 doc/pandas-2.0/source/conf.py | 2 +-
 .../source/internal-architecture.rst | 73 ++++++++++++++++---
 2 files changed, 63 insertions(+), 12 deletions(-)

diff --git a/doc/pandas-2.0/source/conf.py b/doc/pandas-2.0/source/conf.py
index cf536bfa7d9b6..071c8bd31ea6c 100644
--- a/doc/pandas-2.0/source/conf.py
+++ b/doc/pandas-2.0/source/conf.py
@@ -229,7 +229,7 @@
 # author, documentclass [howto, manual, or own class]).
 latex_documents = [
     (master_doc, 'pandas20DesignDocs.tex', 'pandas 2.0 Design Docs Documentation',
-     'pandas Core Team', 'manual'),
+     'Wes McKinney', 'manual'),
 ]

 # The name of an image file (relative to this directory) to place at the top of

diff --git a/doc/pandas-2.0/source/internal-architecture.rst b/doc/pandas-2.0/source/internal-architecture.rst
index 7f51b784db53d..fc224e9bd93b8 100644
--- a/doc/pandas-2.0/source/internal-architecture.rst
+++ b/doc/pandas-2.0/source/internal-architecture.rst
@@ -17,7 +17,7 @@ Logical types and Physical Storage Decoupling

 Since this is the most important, but perhaps also most controversial, change
 (in my opinion) to pandas, I'm going to go over it in great detail. I think the
I think the -hardest part of coming up with clear language and definitions for concepts so +hardest part is coming up with clear language and definitions for concepts so that we can communicate effectively. For example the term "data type" is vague and may mean different things to different people. @@ -124,9 +124,9 @@ types. For example: you could have a categorical type (a logical construct consisting of multiple arrays of data) whose categories are some other logical type. -For historical reasons, **pandas never developed a clear semantic separation in -its user API between logical and physical data types**. Also, the addition of -new, pandas-only "synthetic" dtypes that are unknown to NumPy (like +For historical reasons, **pandas never developed a clear or clean semantic +separation in its user API between logical and physical data types**. Also, the +addition of new, pandas-only "synthetic" dtypes that are unknown to NumPy (like categorical, datetimetz, etc.) has expanded this conflation considerably. If you also consider pandas's custom missing / NULL data behavior, the addition of ad hoc missing data semantics to a physical NumPy data type created, by the @@ -168,7 +168,7 @@ The major goals of introducing a logical type abstraction are the follows: right code branches based on the data type. * Enabling pandas to decouple both its internal semantics and physical storage from NumPy's metadata and APIs. Note that this is already happening with - categorical types, since a particular instance ``CategoricalDtype`` may + categorical types, since a particular instance of ``CategoricalDtype`` may physically be stored in one of 4 NumPy data types. Physical storage decoupling @@ -189,12 +189,16 @@ By separating pandas data from the presumption of using a particular physical data by forming a composite data structure consisting of a NumPy array plus a bitmap marking the null / not-null values. +* We can start to think about improved behavior around data ownership (like + copy-on-write) which may yield many benefits. I will write a dedicated + section about this. + Note that neither of these points implies that we are trying to use NumPy -less. We already have large amounts of code that implement algorithms also -found in NumPy (see ``pandas.unique`` or the implementation of ``Series.sum``), -but taking into account pandas's missing data representation, etc. Internally, -we can use NumPy when its computational semantics match those we've chosen for -pandas, and elsewhere we can invoke pandas-specific code. +less. We already have large amounts of code that implement algorithms similar +to those found in NumPy (e.g. ``pandas.unique`` or the implementation of +``Series.sum``), but taking into account pandas's missing data representation, +etc. Internally, we can use NumPy when its computational semantics match those +we've chosen for pandas, and elsewhere we can invoke pandas-specific code. A major concern here based on these ideas is **preserving NumPy interoperability**, so I'll examine this topic in some detail next. @@ -202,6 +206,53 @@ interoperability**, so I'll examine this topic in some detail next. Preserving NumPy interoperability ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Some of types of intended interoperability between NumPy and pandas are as +follows: + +* Users can obtain the a ``numpy.ndarray`` (possibly a view depending on the + internal block structure, more on this soon) in constant time and without + copying the actual data. 
This has a couple of other implications:
+
+  * Changes made to this array will be reflected in the source pandas object.
+  * If you write C extension code (possibly in Cython) and respect pandas's
+    missing data details, you can invoke certain kinds of fast custom code on
+    pandas data (but it's somewhat inflexible -- see the latest discussion on
+    adding a native code API to pandas).
+
+* NumPy ufuncs (like ``np.sqrt`` or ``np.log``) can be invoked on
+  pandas objects like Series and DataFrame
+
+* ``numpy.asarray`` will always yield some array, even if it discards metadata
+  or has to create a new array. For example ``asarray`` invoked on
+  ``pandas.Categorical`` yields a reconstructed array (rather than either the
+  categories or codes internal arrays)
+
+* Many NumPy methods designed to work on subclasses (or duck-typed classes) of
+  ``ndarray`` may be used. For example ``numpy.sum`` may be used on a Series
+  even though it does not invoke NumPy's internal C sum algorithm. This means
+  that a Series may be used as an interchangeable argument in a large set of
+  functions that only know about NumPy arrays.
+
+By and large, I think much of this can be preserved, but there will be some API
+breakage.
+
+If we add more composite data structures (Categorical can be thought of as
+one existing composite data structure) to pandas or alternate non-NumPy data
+structures, there will be cases where the semantic information in a Series
+cannot be adequately represented in a NumPy array.
+
+As one example, if we add pandas-only missing data support to integer and
+boolean data (a long requested feature), calling ``np.asarray`` on such data
+may not have well-defined behavior. At present, pandas is implicitly converting
+these types to ``float64`` (see more below), which isn't too great. A decision
+does not need to be made now, but the benefits of solving this long-standing
+issue may merit breaking ``asarray`` as long as we provide an explicit way to
+obtain the casted ``float64`` NumPy array (with ``NaN`` for NULL/NA
+values).
+
+For pandas data that does not step outside NumPy's semantic realm, we can
+continue to provide zero-copy views in many cases.

 Removal of BlockManager / new DataFrame internals
 =================================================

@@ -360,7 +411,7 @@ computations. Let's take for example:

 Profiling ``s.sum()`` with ``%prun`` in IPython, I am seeing 116 function
 calls (pandas 0.18.1). Let's look at the microperformance:

-.. code-block:: python
+.. code-block:: text

   In [14]: timeit s.sum()
   10000 loops, best of 3: 31.7 µs per loop

From c7819cf806914c5fb9d375d01b19ab850ab405d8 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Mon, 8 Aug 2016 20:01:37 -0700
Subject: [PATCH 8/8] Exposition on BlockManager / C++

---
 doc/pandas-2.0/source/capi.rst | 5 -
 doc/pandas-2.0/source/index.rst | 1 -
 .../source/internal-architecture.rst | 339 ++++++++++++++++--
 doc/pandas-2.0/source/removals.rst | 4 +
 4 files changed, 313 insertions(+), 36 deletions(-)
 delete mode 100644 doc/pandas-2.0/source/capi.rst

diff --git a/doc/pandas-2.0/source/capi.rst b/doc/pandas-2.0/source/capi.rst
deleted file mode 100644
index 6c16aee8f6ed2..0000000000000
--- a/doc/pandas-2.0/source/capi.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. _capi:
-
-=============================================
- A native code API (Cython / C++) for pandas
-=============================================

diff --git a/doc/pandas-2.0/source/index.rst b/doc/pandas-2.0/source/index.rst
index 62612a4e596de..70a2c25bbf2b5 100644
--- a/doc/pandas-2.0/source/index.rst
+++ b/doc/pandas-2.0/source/index.rst
@@ -14,7 +14,6 @@ project in the future.

   internal-architecture
   strings
   copyonwrite
-   capi
   removals

 .. Indices and tables

diff --git a/doc/pandas-2.0/source/internal-architecture.rst b/doc/pandas-2.0/source/internal-architecture.rst
index fc224e9bd93b8..c3d38e2957aa3 100644
--- a/doc/pandas-2.0/source/internal-architecture.rst
+++ b/doc/pandas-2.0/source/internal-architecture.rst
@@ -253,12 +253,6 @@ values).

 For pandas data that does not step outside NumPy's semantic realm, we can
 continue to provide zero-copy views in many cases.

-Removal of BlockManager / new DataFrame internals
-=================================================
-
-``pandas.Array`` and ``pandas.Table``
-=====================================
-
 Missing data consistency
 ========================

@@ -358,32 +352,137 @@ Tradeoffs

 One potential downside of the bitmap approach is that missing data implemented
 outside of NumPy's domain will need to be explicitly converted if it is needed
 in another library that only knows about NumPy. I argue that this is better
-than the current
-
-Memory accounting
-=================
-
-Proper types for strings and some non-numeric data
-==================================================
-
-I believe that frequently-occurring data types, such as UTF8 strings, are
-important enough to deserve a dedicated logical pandas data type. This will
-enable us both to enforce tighter API semantics (i.e. attempts to assign a
-non-string into string data will be a ``TypeError``) and improved performance
-and memory use under the hood. I will devote an entire section to talking about
-strings.
-
-In general, I would be supportive of making Python object (``numpy.object_``
-dtype) arrays the solution only for mixed-type arrays and data types for which
-pandas has no native handling.
-
-Permitting "other" (non-NumPy) data structures
-==============================================
-
+than the current implicit conversion, which could yield data loss (for integers
+falling outside the exact representable range for ``float64``).

+Removal of BlockManager / new DataFrame internals
+=================================================
+
+Deep inside the belly of pandas objects, there is a data structure called
+``BlockManager`` which, at a high level, is responsible for managing the
+physical arrays where the data inside a Series or DataFrame is looked after
+(the same goes for the Panel / PanelND structures, even though these are on
+their way to deprecation).
+
+While this data structure has served pandas well since its birth 5 years ago
+(Summer 2011), it has a number of problems that make its removal and
+replacement with something else an attractive option.
+
+The goal of this section is to explain what the BlockManager is, why it exists
+at all, and why we should consider removing it.
+
+What is ``BlockManager`` and why does it exist?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The reason that ``BlockManager`` exists at all goes back to some ancient pandas
+history. Originally, the data in ``pandas.DataFrame`` was stored in a Python
+``dict`` object. If you pull up pandas 0.1 or 0.2, you will see this.
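For illustration, that original storage model amounted to something like the
following rough sketch (the column names and data are made up):

.. code-block:: python

   import numpy as np

   # Rough sketch of the pandas 0.1-era storage model: a plain dict mapping
   # column name -> 1D ndarray, one entry per column.
   frame_data = {
       'a': np.array([1.5, 2.0, 3.25]),
       'b': np.array(['x', 'y', 'z'], dtype=object),
   }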
+
+Since the business logic of pandas's internals was originally implemented in
+pure Python, as it still is (but much larger / more complex), there was a
+marked performance difference between column-oriented operations and
+row-oriented operations. The reason for this is not really a memory layout
+issue (NumPy users know about how contiguous memory access produces much better
+performance) so much as a reliance on NumPy's two-dimensional array operations
+for carrying out pandas's computations. So, to do anything row oriented on an
+all-numeric DataFrame, pandas would concatenate all of the columns together
+(using ``numpy.vstack`` or ``numpy.hstack``) then use array broadcasting or
+methods like ``ndarray.sum`` (combined with ``np.isnan`` to account for missing
+data) to carry out certain operations.
+
+1. pandas's early users (i.e. AQR employees) beseeched me to address this
+   performance issue. Thus ``DataMatrix`` was created, a roughly API-equivalent
+   object whose internal storage was a 2D NumPy array, intended to be of a
+   homogeneous type (e.g. ``numpy.float64``). The downside of this was that if
+   you inserted a string column, everything would become ``numpy.object_``
+   dtype. Users did not like that.
+
+2. It had become apparent that the dichotomy between DataFrame and DataMatrix
+   (and when to use each) was harming pandas's adoption and confusing users. So
+   I set about creating a hybrid data structure that had "the best of both
+   worlds".
+
+3. The idea was that the BlockManager would track collections of NumPy arrays
+   having the same dtype, particularly as columns were inserted or removed
+   (i.e. the *building* phase of the DataFrame's lifetime).
+
+4. When you would invoke an operation that benefited from a single
+   *consolidated* 2-dimensional ndarray of say ``float64`` dtype (for example:
+   using ``reindex`` or performing a row-oriented operation), the BlockManager
+   would glue together its accumulated pieces to create a single 2D ndarray of
+   each data type. This is called **consolidation** in the codebase.
+
+5. Since, in practice, heterogeneous DataFrames have different types
+   interspersed amongst their columns, the BlockManager maintains a mapping
+   between the absolute column position and the relative position within the
+   type-specific 2D "block".
+
+6. Over time, the BlockManager has been generalized for the 1 through N
+   dimensional cases, not just the 2D case, so that even Series has a lean
+   "SingleBlockManager" internally.
+
+Drawbacks of BlockManager
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+While this data structure has enabled pandas to make it this far in life, it
+has a number of drawbacks (not a complete list):
+
+1. **Code complexity**: this has manifested in a number of ways (and probably
+   others that I'm missing)
+
+   * Making some of the most important algorithms in pandas fast, like joins
+     and reshape operations, requires carefully constructing the precise block
+     structure of the output DataFrame so that no further copying or
+     consolidation will take place.
+
+   * Adding new custom data types to DataFrame and not losing their metadata
+     (e.g. time zones or categories) has had a sort of "fan out" effect
+     touching numerous parts of the BlockManager internals.
+
+2. **Loss of user visibility into memory use and memory layout**: With large
+   data sets, some "naively" constructed DataFrame objects (e.g. from a dict of
+   ndarrays) can produce a memory-doubling effect that may cause out-of-memory
+   errors.
   Also, consolidated blocks can (depending on the version of pandas)
+   result in columns having strided / non-contiguous data, resulting in
+   degraded performance in column-oriented operations.
+
+3. **Unavoidable consolidation**: Fairly common operations, like ``read_csv``,
+   may require a consolidation step after completion, which for large data may
+   result in performance or memory overhead (similar to the above bullet
+   point).
+
+4. **Microperformance issues / indexing slowness**: since a DataFrame can be a
+   sort of many-layered onion, many common pandas operations may weave through
+   dozens of different functions navigating the structure of the object and
+   producing the appropriate output. I will talk more about microperformance
+   later.
+
+Replacing BlockManager without weakening pandas
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Our goal in replacing BlockManager would be to achieve:
+
+* Substantially simpler code
+* Easier extensibility with new logical types
+* Performance on par with (or better than) the current implementation
+* Better user control over memory use and layout
+* Improved microperformance
+
+I believe we can do this, but it will require a significant inversion of the
+internal code architecture to involve more native code and less interpreted
+Python. For example, it will be difficult or impossible to achieve comparable
+performance in row-oriented operations (on consolidated DataFrame objects) with
+pure Python code.
+
+In the next section, I will start making my case for creating a "native core"
+library where we can assemble the low level data structures, logical types, and
+memory management for pandas. Additionally, we would want to port much of
+pandas's helper Cython code to live inside this library and operate directly on
+the internal data structures rather than being orchestrated from the Python
+interpreter level.

+Building "libpandas" in C++11/14 for the lowest level implementation tier
+=========================================================================

 Currently, pandas architecturally is structured as follows:

@@ -393,6 +492,7 @@ Currently, pandas architecturally is structured as follows:

 * Pure Python implementation of internal data structure business logic
 * Algorithms in Cython (more often) or C (less often) to accelerate
   computationally-intensive algorithms

 While it's overall made pandas easier to develop and maintain internally
 (perhaps increasingly less so over time!), this has had a number of drawbacks
+as we've discussed. I mentioned microperformance above, so about that:

 Microperformance
 ~~~~~~~~~~~~~~~~

@@ -427,9 +527,188 @@ compelling number, but such a method called 1 million times has an additional
 30 seconds of overhead. When you consider microperformance in the context of
 custom ``groupby`` operations, for example, this may not be so unrealistic.

+C or C++ (C++11, to be specific)?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+At the risk of instigating a religious programming language debate, pandas's
+use of Cython in many places is very C++-like:
+
+* Generic programming through manual code generation (now using tempita)
+  instead of templates (a small sketch of this pattern follows below)
+* Auxiliary types and data structures as ``cdef class`` extension types
+* Relying on Python's reference counting for garbage collection and cleanup
+  after exceptions are raised.
+
+The "blend C and Cython" style has aided developer productivity.
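To illustrate the code generation point above, here is a rough, hypothetical
sketch of the tempita-style pattern; the ``take_1d_*`` names echo the
``take_1d`` implementations mentioned earlier, but the template text itself is
made up for illustration:

.. code-block:: python

   # Hypothetical sketch of the "manual code generation" pattern: one
   # function is stamped out per dtype name instead of using C++ templates.
   TEMPLATE = '''
   def take_1d_{name}(values, indexer, out, fill_value):
       for i in range(len(indexer)):
           idx = indexer[i]
           out[i] = fill_value if idx == -1 else values[idx]
   '''

   for name in ('int64', 'float64', 'object'):
       exec(TEMPLATE.format(name=name).strip())

   # e.g. take with a -1 (NA) slot in the indexer
   out = [None, None]
   take_1d_float64([1.0, 2.0], [1, -1], out, float('nan'))  # out -> [2.0, nan]

In C++, this entire family of functions could instead be a single template.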
+
+I argue that judicious and responsible use of modern C++ (and following a
+reasonable style guide like `Google's guide`_, or some slight variation) will
+enable us to:
+
+* Simplify our existing Cython codebase by using templates (and very limited
+  template metaprogramming)
+
+* Achieve easier generic programming / inlining of data-type specific logic at
+  compile time.
+
+* Use RAII (exception-safe allocation) and smart pointers (``std::unique_ptr``
+  and ``std::shared_ptr``) to simplify memory management
+
+* Define performant C++ classes modeling the current internals, with various
+  mechanisms for code reuse or type-specific dynamic dispatch (i.e. through
+  template classes, CRTP, or simply virtual functions).
+
+* Use C++11 standard library concurrency tools to more easily create concurrent
+  / multithreaded implementations of common pandas algorithms.
+
+By pushing down much of the business logic into C++ (with use of the Python and
+NumPy C API where relevant), we'll be able to achieve macroperformance on par
+with or better than the current BlockManager-based implementation and handily
+better microperformance in indexing and simple analytics.
+
+``pandas.Array`` types
+~~~~~~~~~~~~~~~~~~~~~~
+
+My gut feeling is that we would want to create relatively simple container
+classes having a common ``pandas::Array`` base type in C++, each of which
+models a particular logical type. Each array type would have a corresponding
+logical type implementation, in the vein of:
+
+.. code-block:: c++
+
+    class Array {
+      // public API omitted
+     private:
+      std::shared_ptr<DataType> type_;
+    };
+
+    class CategoricalType : public DataType {
+      // implementation
+
+     private:
+      std::shared_ptr<Array> categories_;
+    };
+
+    class CategoricalArray : public Array {
+     public:
+      std::shared_ptr<Array> codes() const;
+      std::shared_ptr<Array> categories() const;
+      // rest of implementation omitted
+    };
+
+An array containing a NumPy array will invoke ``Py_DECREF`` in its destructor,
+so that after construction one can proceed largely with C++ programming
+semantics without much need for manual memory management.
+
+These Array types would be wrapped and exposed to pandas developers (probably
+in Cython).
+
+Index types
+~~~~~~~~~~~
+
+Like pandas's current code structure, Index types would be composed from the
+Array types and some additional data structures (hash tables) for lookups and
+other index operations. These can be similarly exposed to the world via Cython
+(and wrapped in a convenient ``pandas.Index`` class).
+
+``pandas.Table``
+~~~~~~~~~~~~~~~~
+
+My recommendation is to decommission the BlockManager in favor of a much
+simpler low-level Table class, which operates more similarly to an R data.frame
+(e.g. no row index). This would look something like:
+
+.. code-block:: c++
+
+    class Table {
+     public:
+      std::shared_ptr<Array> GetColumn(int i);
+      void SetColumn(int i, const std::shared_ptr<Array>& arr);
+
+      // rest of public API omitted
+     private:
+      // Column index, possibly not necessary
+      std::shared_ptr<Index> columns_;
+
+      // List of arrays
+      std::vector<std::shared_ptr<Array>> data_;
+    };
+
+Operators and dynamic dispatch
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Under this proposed class structure, it may not make sense to add operations as
+class methods. We could possibly do something like:
+.. code-block:: c++
+
+    #include "pandas/dispatch.h"
+
+    // other includes omitted
+
+    using ArrayRef = std::shared_ptr<Array>;
+
+    template <typename U, typename V>
+    inline ArrayRef TakeImpl(U, V) {
+      // Implementation omitted
+    }
+
+    ArrayRef Take(ArrayRef values, ArrayRef indices) {
+      return Dispatch(values, indices);
+    }
+
+Here, the Dispatch template would generate the matrix of logical type
+combinations, some of which might throw a not implemented exception.
+
+There are other approaches to dealing with runtime dispatch that don't feature
+too much overhead.
+
+Memory accounting
+~~~~~~~~~~~~~~~~~
+
+If pandas's internals are encapsulated in C++ classes inside the libpandas core
+library, we could atomically track all memory allocations and deallocations to
+produce a precise accounting of the number of bytes that pandas has currently
+allocated (excluding memory that is opaque to pandas: object arrays, for
+example, would only be counted by their ``PyObject*`` pointer array footprint).
+
+Development toolchain
+~~~~~~~~~~~~~~~~~~~~~
+
+Introducing C++11 to pandas's development toolchain will add quite a bit of
+complexity for developers, especially compared with pandas's current Cython and
+C codebase, which basically builds out of the box for most people. For
+cross-platform support, it would be better to use CMake than something else
+(distutils doesn't have adequate support for C++).
+
+Logical types for strings and possibly other non-numeric data
+==============================================================
+
+I believe that frequently-occurring data types, such as UTF8 strings, are
+important enough to deserve a dedicated logical pandas data type. This will
+enable us both to enforce tighter API semantics (i.e. attempts to assign a
+non-string into string data will be a ``TypeError``) and improved performance
+and memory use under the hood. I will devote an entire section to talking about
+strings.
+
+In general, I would be supportive of making Python object (``numpy.object_``
+dtype) arrays the solution only for mixed-type arrays and data types for which
+pandas has no native handling.
+
 3rd-party native API (i.e. Cython and C / C++)
 ==============================================

 Developers of 3rd-party projects (myself included) have often expressed a
 desire to be able to inspect, construct, or otherwise manipulate pandas objects
 (even if in a limited fashion) in compiled code (Cython, C, or C++).
+
+Per the discussion of libpandas and a native core, I would propose the
+following:
+
+* Define public-facing ``.pxd`` files that allow developers to use ``cimport``
+  and get access to pandas's internal extension types.
+* Define factory functions that enable fully formed Series and DataFrame
+  objects to be constructed either by Cython API calls or potentially also C++
+  libpandas API calls.
+* Provide Cython APIs for 3rd-party developers to obtain pointers to access the
+  underlying C++ objects contained in the wrapper Python objects.

diff --git a/doc/pandas-2.0/source/removals.rst b/doc/pandas-2.0/source/removals.rst
index 7ef90c5d65a0d..5f10485b31405 100644
--- a/doc/pandas-2.0/source/removals.rst
+++ b/doc/pandas-2.0/source/removals.rst
@@ -17,6 +17,10 @@ available in Python 3.

 Deprecated code to remove
 =========================

+* ``.ix`` indexing entirely
+* ``Panel`` and ``PanelND`` classes
+* Plotting?
+
 Other ideas
 ===========