diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index ccefb6a15..18e680ca3 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -18,6 +18,7 @@ It is mainly dedicated to help new users to familiarize with it and others to re ./tutorial/tutorial_transforming.ipynb ./tutorial/tutorial_indexing.ipynb ./tutorial/tutorial_arithmetic_op_and_aggregation.ipynb + ./tutorial/tutorial_string_syntax.ipynb ./tutorial/tutorial_plotting.ipynb ./tutorial/tutorial_miscellaneous.ipynb ./tutorial/tutorial_sessions.ipynb diff --git a/doc/source/tutorial/tutorial_IO.ipyml b/doc/source/tutorial/tutorial_IO.ipyml index 78967e63c..23296c9ac 100644 --- a/doc/source/tutorial/tutorial_IO.ipyml +++ b/doc/source/tutorial/tutorial_IO.ipyml @@ -523,7 +523,7 @@ cells: - code: | - read_hdf(filepath_hdf, key='deaths', sort_rows=True) + read_hdf(filepath_hdf, key='deaths').sort_axes() - markdown: | @@ -574,17 +574,17 @@ cells: - code: | # create a new Session object and load all arrays, axes, groups and metadata # from all CSV files located in the passed directory - csv_dir = get_example_filepath('population_session') + csv_dir = get_example_filepath('demography_eurostat') session = Session(csv_dir) # create a new Session object and load all arrays, axes, groups and metadata # stored in the passed Excel file - filepath_excel = get_example_filepath('population_session.xlsx') + filepath_excel = get_example_filepath('demography_eurostat.xlsx') session = Session(filepath_excel) # create a new Session object and load all arrays, axes, groups and metadata # stored in the passed HDF5 file - filepath_hdf = get_example_filepath('population_session.h5') + filepath_hdf = get_example_filepath('demography_eurostat.h5') session = Session(filepath_hdf) print(session.summary()) @@ -728,7 +728,7 @@ metadata: name: python nbconvert_exporter: python pygments_lexer: ipython3 - version: 3.6.8 + version: 3.7.3 livereveal: autolaunch: false scroll: true diff --git 
a/doc/source/tutorial/tutorial_IO.ipynb b/doc/source/tutorial/tutorial_IO.ipynb index d51b619fe..6d9225e23 100644 --- a/doc/source/tutorial/tutorial_IO.ipynb +++ b/doc/source/tutorial/tutorial_IO.ipynb @@ -1,1066 +1,1066 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Load And Dump Arrays, Sessions, Axes And Groups\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "LArray provides methods and functions to load and dump LArray, Session, Axis Group objects to several formats such as Excel, CSV and HDF5. The HDF5 file format is designed to store and organize large amounts of data. It allows to read and write data much faster than when working with CSV and Excel files. \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [], - "source": [ - "# run this cell to avoid annoying warnings\n", - "import warnings\n", - "warnings.filterwarnings(\"ignore\", message=r'.*numpy.dtype size changed*')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# first of all, import the LArray library\n", - "from larray import *" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check the version of LArray:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from larray import __version__\n", - "__version__" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Loading and Dumping Arrays\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Loading Arrays - Basic Usage (CSV, Excel, HDF5)\n", - "\n", - "To read an array from a CSV file, you must use the ``read_csv`` function:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "csv_dir = get_example_filepath('examples')\n", - "\n", - "# read 
the array pop from the file 'pop.csv'.\n", - "# The data of the array below is derived from a subset of the demo_pjan table from Eurostat\n", - "pop = read_csv(csv_dir + '/pop.csv')\n", - "pop" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To read an array from a sheet of an Excel file, you can use the ``read_excel`` function:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "filepath_excel = get_example_filepath('examples.xlsx')\n", - "\n", - "# read the array from the sheet 'births' of the Excel file 'examples.xlsx'\n", - "# The data of the array below is derived from a subset of the demo_fasec table from Eurostat\n", - "births = read_excel(filepath_excel, 'births')\n", - "births" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The ``open_excel`` function in combination with the ``load`` method allows you to load several arrays from the same Workbook without opening and closing it several times:\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```python\n", - "# open the Excel file 'population.xlsx' and let it opened as long as you keep the indent.\n", - "# The Python keyword ``with`` ensures that the Excel file is properly closed even if an error occurs\n", - "with open_excel(filepath_excel) as wb:\n", - " # load the array 'pop' from the sheet 'pop' \n", - " pop = wb['pop'].load()\n", - " # load the array 'births' from the sheet 'births'\n", - " births = wb['births'].load()\n", - " # load the array 'deaths' from the sheet 'deaths'\n", - " deaths = wb['deaths'].load()\n", - "\n", - "# the Workbook is automatically closed when getting out the block defined by the with statement\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - " **Warning:** `open_excel` requires to work on Windows and to have the library ``xlwings`` installed.\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `HDF5` file format is specifically designed to store and organize large amounts of data. \n", - "Reading and writing data in this file format is much faster than with CSV or Excel. \n", - "An HDF5 file can contain multiple arrays, each array being associated with a key.\n", - "To read an array from an HDF5 file, you must use the ``read_hdf`` function and provide the key associated with the array:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "filepath_hdf = get_example_filepath('examples.h5')\n", - "\n", - "# read the array from the file 'examples.h5' associated with the key 'deaths'\n", - "# The data of the array below is derived from a subset of the demo_magec table from Eurostat\n", - "deaths = read_hdf(filepath_hdf, 'deaths')\n", - "deaths" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Dumping Arrays - Basic Usage (CSV, Excel, HDF5)\n", - "\n", - "To write an array in a CSV file, you must use the ``to_csv`` method:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# save the array pop in the file 'pop.csv'\n", - "pop.to_csv('pop.csv')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To write an array to a sheet of an Excel file, you can use the ``to_excel`` method:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# save the array pop in the sheet 'pop' of the Excel file 'population.xlsx' \n", - "pop.to_excel('population.xlsx', 'pop')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that ``to_excel`` create a new Excel file if it does not exist yet. 
\n", - "If the file already exists, a new sheet is added after the existing ones if that sheet does not already exists:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# add a new sheet 'births' to the file 'population.xlsx' and save the array births in it\n", - "births.to_excel('population.xlsx', 'births')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To reset an Excel file, you simply need to set the `overwrite_file` argument as True:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 1. reset the file 'population.xlsx' (all sheets are removed)\n", - "# 2. create a sheet 'pop' and save the array pop in it\n", - "pop.to_excel('population.xlsx', 'pop', overwrite_file=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The ``open_excel`` function in combination with the ``dump()`` method allows you to open a Workbook and to export several arrays at once. If the Excel file doesn't exist, the ``overwrite_file`` argument must be set to True.\n", - "\n", - "
\n", - " **Warning:** The ``save`` method must be called at the end of the block defined by the *with* statement to actually write data in the Excel file, otherwise you will end up with an empty file.\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```python\n", - "# to create a new Excel file, argument overwrite_file must be set to True\n", - "with open_excel('population.xlsx', overwrite_file=True) as wb:\n", - " # add a new sheet 'pop' and dump the array pop in it \n", - " wb['pop'] = pop.dump()\n", - " # add a new sheet 'births' and dump the array births in it \n", - " wb['births'] = births.dump()\n", - " # add a new sheet 'deaths' and dump the array deaths in it \n", - " wb['deaths'] = deaths.dump()\n", - " # actually write data in the Workbook\n", - " wb.save()\n", - " \n", - "# the Workbook is automatically closed when getting out the block defined by the with statement\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To write an array in an HDF5 file, you must use the ``to_hdf`` function and provide the key that will be associated with the array:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# save the array pop in the file 'population.h5' and associate it with the key 'pop'\n", - "pop.to_hdf('population.h5', 'pop')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Specifying Wide VS Narrow format (CSV, Excel)\n", - "\n", - "By default, all reading functions assume that arrays are stored in the ``wide`` format, meaning that their last axis is represented horizontally:\n", - "\n", - "| country \\\\ time | 2013 | 2014 | 2015 |\n", - "| --------------- | -------- | -------- | -------- |\n", - "| Belgium | 11137974 | 11180840 | 11237274 |\n", - "| France | 65600350 | 65942267 | 66456279 |\n", - "\n", - "By setting the ``wide`` argument to False, reading functions will assume instead that arrays are stored in the ``narrow`` format, i.e. 
one column per axis plus one value column:\n", - "\n", - "| country | time | value |\n", - "| ------- | ---- | -------- |\n", - "| Belgium | 2013 | 11137974 |\n", - "| Belgium | 2014 | 11180840 |\n", - "| Belgium | 2015 | 11237274 |\n", - "| France | 2013 | 65600350 |\n", - "| France | 2014 | 65942267 |\n", - "| France | 2015 | 66456279 |\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# set 'wide' argument to False to indicate that the array is stored in the 'narrow' format\n", - "pop_BE_FR = read_csv(csv_dir + '/pop_narrow_format.csv', wide=False)\n", - "pop_BE_FR" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# same for the read_excel function\n", - "pop_BE_FR = read_excel(filepath_excel, sheet='pop_narrow_format', wide=False)\n", - "pop_BE_FR" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "By default, writing functions will set the name of the column containing the data to 'value'. You can choose the name of this column by using the ``value_name`` argument. 
For example, using ``value_name='population'`` you can export the previous array as:\n", - "\n", - "| country | time | population |\n", - "| ------- | ---- | ---------- |\n", - "| Belgium | 2013 | 11137974 |\n", - "| Belgium | 2014 | 11180840 |\n", - "| Belgium | 2015 | 11237274 |\n", - "| France | 2013 | 65600350 |\n", - "| France | 2014 | 65942267 |\n", - "| France | 2015 | 66456279 |\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# dump the array pop_BE_FR in a narrow format (one column per axis plus one value column).\n", - "# By default, the name of the column containing data is set to 'value'\n", - "pop_BE_FR.to_csv('pop_narrow_format.csv', wide=False)\n", - "\n", - "# same but replace 'value' by 'population'\n", - "pop_BE_FR.to_csv('pop_narrow_format.csv', wide=False, value_name='population')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# same for the to_excel method\n", - "pop_BE_FR.to_excel('population.xlsx', 'pop_narrow_format', wide=False, value_name='population')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Like with the ``to_excel`` method, it is possible to export arrays in a ``narrow`` format using ``open_excel``. 
\n", - "To do so, you must set the ``wide`` argument of the ``dump`` method to False:\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```python\n", - "with open_excel('population.xlsx') as wb:\n", - " # dump the array pop_BE_FR in a narrow format: \n", - " # one column per axis plus one value column.\n", - " # Argument value_name can be used to change the name of the \n", - " # column containing the data (default name is 'value')\n", - " wb['pop_narrow_format'] = pop_BE_FR.dump(wide=False, value_name='population')\n", - " # don't forget to call save()\n", - " wb.save()\n", - "\n", - "# in the sheet 'pop_narrow_format', data is written as:\n", - "# | country | time | value |\n", - "# | ------- | ---- | -------- |\n", - "# | Belgium | 2013 | 11137974 |\n", - "# | Belgium | 2014 | 11180840 |\n", - "# | Belgium | 2015 | 11237274 |\n", - "# | France | 2013 | 65600350 |\n", - "# | France | 2014 | 65942267 |\n", - "# | France | 2015 | 66456279 |\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Specifying Position in Sheet (Excel)\n", - "\n", - "If you want to read an array from an Excel sheet which does not start at cell `A1` (when there is more than one array stored in the same sheet for example), you will need to use the ``range`` argument. \n", - "\n", - "
\n", - " **Warning:** Note that the ``range`` argument is only available if you have the library ``xlwings`` installed (Windows).\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```python\n", - "# the 'range' argument must be used to load data not starting at cell A1.\n", - "# This is useful when there is several arrays stored in the same sheet\n", - "births = read_excel(filepath_excel, sheet='pop_births_deaths', range='A9:E15')\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using ``open_excel``, ranges are passed in brackets:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```python\n", - "with open_excel(filepath_excel) as wb:\n", - " # store sheet 'pop_births_deaths' in a temporary variable sh\n", - " sh = wb['pop_births_deaths']\n", - " # load the array pop from range A1:E7\n", - " pop = sh['A1:E7'].load()\n", - " # load the array births from range A9:E15\n", - " births = sh['A9:E15'].load()\n", - " # load the array deaths from range A17:E23\n", - " deaths = sh['A17:E23'].load()\n", - "\n", - "# the Workbook is automatically closed when getting out the block defined by the with statement\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When exporting arrays to Excel files, data is written starting at cell `A1` by default. Using the ``position`` argument of the ``to_excel`` method, it is possible to specify the top left cell of the dumped data. This can be useful when you want to export several arrays in the same sheet for example\n", - "\n", - "
\n", - " **Warning:** Note that the ``position`` argument is only available if you have the library ``xlwings`` installed (Windows).\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```python\n", - "filename = 'population.xlsx'\n", - "sheetname = 'pop_births_deaths'\n", - "\n", - "# save the arrays pop, births and deaths in the same sheet 'pop_births_and_deaths'.\n", - "# The 'position' argument is used to shift the location of the second and third arrays to be dumped\n", - "pop.to_excel(filename, sheetname)\n", - "births.to_excel(filename, sheetname, position='A9')\n", - "deaths.to_excel(filename, sheetname, position='A17')\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using ``open_excel``, the position is passed in brackets (this allows you to also add extra informations): \n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```python\n", - "with open_excel('population.xlsx') as wb:\n", - " # add a new sheet 'pop_births_deaths' and write 'population' in the first cell\n", - " # note: you can use wb['new_sheet_name'] = '' to create an empty sheet\n", - " wb['pop_births_deaths'] = 'population'\n", - " # store sheet 'pop_births_deaths' in a temporary variable sh\n", - " sh = wb['pop_births_deaths']\n", - " # dump the array pop in sheet 'pop_births_deaths' starting at cell A2\n", - " sh['A2'] = pop.dump()\n", - " # add 'births' in cell A10\n", - " sh['A10'] = 'births'\n", - " # dump the array births in sheet 'pop_births_deaths' starting at cell A11 \n", - " sh['A11'] = births.dump()\n", - " # add 'deaths' in cell A19\n", - " sh['A19'] = 'deaths'\n", - " # dump the array deaths in sheet 'pop_births_deaths' starting at cell A20\n", - " sh['A20'] = deaths.dump()\n", - " # don't forget to call save()\n", - " wb.save()\n", - " \n", - "# the Workbook is automatically closed when getting out the block defined by the with statement\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Exporting data without headers (Excel)\n", - "\n", - "For some reasons, you may want to 
export only the data of an array without axes. For example, you may want to insert a new column containing extra information. As an exercise, let us consider we want to add the capital city for each country present in the array containing the total population by country:\n", - "\n", - "| country | capital city | 2013 | 2014 | 2015 |\n", - "| ------- | ------------ | -------- | -------- | -------- |\n", - "| Belgium | Brussels | 11137974 | 11180840 | 11237274 |\n", - "| France | Paris | 65600350 | 65942267 | 66456279 |\n", - "| Germany | Berlin | 80523746 | 80767463 | 81197537 |\n", - "\n", - "Assuming you have prepared an excel sheet as below: \n", - "\n", - "| country | capital city | 2013 | 2014 | 2015 |\n", - "| ------- | ------------ | -------- | -------- | -------- |\n", - "| Belgium | Brussels | | | |\n", - "| France | Paris | | | |\n", - "| Germany | Berlin | | | ||\n", - "\n", - "you can then dump the data at right place by setting the ``header`` argument of ``to_excel`` to False and specifying the position of the data in sheet:\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```python\n", - "pop_by_country = pop.sum('gender')\n", - "\n", - "# export only the data of the array pop_by_country starting at cell C2\n", - "pop_by_country.to_excel('population.xlsx', 'pop_by_country', header=False, position='C2')\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using ``open_excel``, you can easily prepare the sheet and then export only data at the right place by either setting the ``header`` argument of the ``dump`` method to False or avoiding to call ``dump``:\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```python\n", - "with open_excel('population.xlsx') as wb:\n", - " # create new empty sheet 'pop_by_country'\n", - " wb['pop_by_country'] = ''\n", - " # store sheet 'pop_by_country' in a temporary variable sh\n", - " sh = wb['pop_by_country']\n", - " # write extra 
information (description)\n", - " sh['A1'] = 'Population at 1st January by country'\n", - " # export column names\n", - " sh['A2'] = ['country', 'capital city']\n", - " sh['C2'] = pop_by_country.time.labels\n", - " # export countries as first column\n", - " sh['A3'].options(transpose=True).value = pop_by_country.country.labels\n", - " # export capital cities as second column\n", - " sh['B3'].options(transpose=True).value = ['Brussels', 'Paris', 'Berlin']\n", - " # export only data of pop_by_country\n", - " sh['C3'] = pop_by_country.dump(header=False)\n", - " # or equivalently\n", - " sh['C3'] = pop_by_country\n", - " # don't forget to call save()\n", - " wb.save()\n", - " \n", - "# the Workbook is automatically closed when getting out the block defined by the with statement\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Specifying the Number of Axes at Reading (CSV, Excel)\n", - "\n", - "By default, ``read_csv`` and ``read_excel`` will search the position of the first cell containing the special character ``\\`` in the header line in order to determine the number of axes of the array to read. The special character ``\\`` is used to separate the name of the two last axes. If there is no special character ``\\``, ``read_csv`` and ``read_excel`` will consider that the array to read has only one dimension. 
For an array stored as:\n", - "\n", - "| country | gender \\\\ time | 2013 | 2014 | 2015 |\n", - "| ------- | -------------- | -------- | -------- | -------- |\n", - "| Belgium | Male | 5472856 | 5493792 | 5524068 |\n", - "| Belgium | Female | 5665118 | 5687048 | 5713206 |\n", - "| France | Male | 31772665 | 31936596 | 32175328 |\n", - "| France | Female | 33827685 | 34005671 | 34280951 |\n", - "| Germany | Male | 39380976 | 39556923 | 39835457 |\n", - "| Germany | Female | 41142770 | 41210540 | 41362080 |\n", - "\n", - "``read_csv`` and ``read_excel`` will find the special character ``\\`` in the second cell meaning it expects three axes (country, gender and time). \n", - "\n", - "Sometimes, you need to read an array for which the name of the last axis is implicit: \n", - "\n", - "| country | gender | 2013 | 2014 | 2015 |\n", - "| ------- | ------ | -------- | -------- | -------- |\n", - "| Belgium | Male | 5472856 | 5493792 | 5524068 |\n", - "| Belgium | Female | 5665118 | 5687048 | 5713206 |\n", - "| France | Male | 31772665 | 31936596 | 32175328 |\n", - "| France | Female | 33827685 | 34005671 | 34280951 |\n", - "| Germany | Male | 39380976 | 39556923 | 39835457 |\n", - "| Germany | Female | 41142770 | 41210540 | 41362080 |\n", - "\n", - "For such case, you will have to inform ``read_csv`` and ``read_excel`` of the number of axes of the output array by setting the ``nb_axes`` argument:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# read the 3 x 2 x 3 array stored in the file 'pop_missing_axis_name.csv' wihout using 'nb_axes' argument.\n", - "pop = read_csv(csv_dir + '/pop_missing_axis_name.csv')\n", - "# shape and data type of the output array are not what we expected\n", - "pop.info" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# by setting the 'nb_axes' argument, you can indicate to read_csv the number of axes of the 
output array\n", - "pop = read_csv(csv_dir + '/pop_missing_axis_name.csv', nb_axes=3)\n", - "\n", - "# give a name to the last axis\n", - "pop = pop.rename(-1, 'time')\n", - "\n", - "# shape and data type of the output array are what we expected\n", - "pop.info" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# same for the read_excel function\n", - "pop = read_excel(filepath_excel, sheet='pop_missing_axis_name', nb_axes=3)\n", - "pop = pop.rename(-1, 'time')\n", - "pop.info" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### NaNs and Missing Data Handling at Reading (CSV, Excel)\n", - "\n", - "Sometimes, there is no data available for some label combinations. In the example below, the rows corresponding to `France - Male` and `Germany - Female` are missing:\n", - "\n", - "| country | gender \\\\ time | 2013 | 2014 | 2015 |\n", - "| ------- | -------------- | -------- | -------- | -------- |\n", - "| Belgium | Male | 5472856 | 5493792 | 5524068 |\n", - "| Belgium | Female | 5665118 | 5687048 | 5713206 |\n", - "| France | Female | 33827685 | 34005671 | 34280951 |\n", - "| Germany | Male | 39380976 | 39556923 | 39835457 |\n", - "\n", - "By default, ``read_csv`` and ``read_excel`` will fill cells associated with missing label combinations with nans. \n", - "Be aware that, in that case, an int array will be converted to a float array." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# by default, cells associated will missing label combinations are filled with nans.\n", - "# In that case, the output array is converted to a float array\n", - "read_csv(csv_dir + '/pop_missing_values.csv')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "However, it is possible to choose which value to use to fill missing cells using the ``fill_value`` argument:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "read_csv(csv_dir + '/pop_missing_values.csv', fill_value=0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# same for the read_excel function\n", - "read_excel(filepath_excel, sheet='pop_missing_values', fill_value=0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sorting Axes at Reading (CSV, Excel, HDF5)\n", - "\n", - "The ``sort_rows`` and ``sort_columns`` arguments of the reading functions allows you to sort rows and columns alphabetically:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# sort labels at reading --> Male and Female labels are inverted\n", - "read_csv(csv_dir + '/pop.csv', sort_rows=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "read_excel(filepath_excel, sheet='births', sort_rows=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "read_hdf(filepath_hdf, key='deaths', sort_rows=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Metadata (HDF5)\n", - "\n", - "Since the version 0.29 of LArray, it is possible to add metadata to arrays:" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "metadata": {}, - "outputs": [], - "source": [ - "pop.meta.title = 'Population at 1st January'\n", - "pop.meta.origin = 'Table demo_jpan from Eurostat'\n", - "\n", - "pop.info" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "These metadata are automatically saved and loaded when working with the HDF5 file format: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pop.to_hdf('population.h5', 'pop')\n", - "\n", - "new_pop = read_hdf('population.h5', 'pop')\n", - "new_pop.info" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - " **Warning:** Currently, metadata associated with arrays cannot be saved and loaded when working with CSV and Excel files.\n", - " This restriction does not apply however to metadata associated with sessions.\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Loading and Dumping Sessions\n", - "\n", - "One of the main advantages of grouping arrays, axes and groups in session objects is that you can load and save all of them in one shot. Like arrays, it is possible to associate metadata to a session. These can be saved and loaded in all file formats. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Loading Sessions (CSV, Excel, HDF5)\n", - "\n", - "To load the items of a session, you have two options:\n", - "\n", - "1) Instantiate a new session and pass the path to the Excel/HDF5 file or to the directory containing CSV files to the Session constructor:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create a new Session object and load all arrays, axes, groups and metadata \n", - "# from all CSV files located in the passed directory\n", - "csv_dir = get_example_filepath('population_session')\n", - "session = Session(csv_dir)\n", - "\n", - "# create a new Session object and load all arrays, axes, groups and metadata\n", - "# stored in the passed Excel file\n", - "filepath_excel = get_example_filepath('population_session.xlsx')\n", - "session = Session(filepath_excel)\n", - "\n", - "# create a new Session object and load all arrays, axes, groups and metadata\n", - "# stored in the passed HDF5 file\n", - "filepath_hdf = get_example_filepath('population_session.h5')\n", - "session = Session(filepath_hdf)\n", - "\n", - "print(session.summary())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "2) Call the ``load`` method on an existing session and pass the path to the Excel/HDF5 file or to the directory containing CSV files as first argument:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create a session containing 3 axes, 2 groups and one array 
'pop'\n", - "filepath = get_example_filepath('pop_only.xlsx')\n", - "session = Session(filepath)\n", - "\n", - "print(session.summary())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# call the load method on the previous session and add the 'births' and 'deaths' arrays to it\n", - "filepath = get_example_filepath('births_and_deaths.xlsx')\n", - "session.load(filepath)\n", - "\n", - "print(session.summary())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The ``load`` method offers some options:\n", - "\n", - "1) Using the ``names`` argument, you can specify which items to load:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "session = Session()\n", - "\n", - "# use the names argument to only load births and deaths arrays\n", - "session.load(filepath_hdf, names=['births', 'deaths'])\n", - "\n", - "print(session.summary())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "2) Setting the ``display`` argument to True, the ``load`` method will print a message each time a new item is loaded: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "session = Session()\n", - "\n", - "# with display=True, the load method will print a message\n", - "# each time a new item is loaded\n", - "session.load(filepath_hdf, display=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Dumping Sessions (CSV, Excel, HDF5)\n", - "\n", - "To save a session, you need to call the ``save`` method. 
The first argument is the path to a Excel/HDF5 file or to a directory if items are saved to CSV files:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# save items of a session in CSV files.\n", - "# Here, the save method will create a 'population' directory in which CSV files will be written \n", - "session.save('population')\n", - "\n", - "# save session to an HDF5 file\n", - "session.save('population.h5')\n", - "\n", - "# save session to an Excel file\n", - "session.save('population.xlsx')\n", - "\n", - "# load session saved in 'population.h5' to see its content\n", - "Session('population.h5')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - " Note: Concerning the CSV and Excel formats: \n", - " \n", - " - all Axis objects are saved together in the same Excel sheet (CSV file) named `__axes__(.csv)` \n", - " - all Group objects are saved together in the same Excel sheet (CSV file) named `__groups__(.csv)` \n", - " - metadata is saved in one Excel sheet (CSV file) named `__metadata__(.csv)` \n", - " \n", - " These sheet (CSV file) names cannot be changed. \n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The ``save`` method has several arguments:\n", - "\n", - "1) Using the ``names`` argument, you can specify which items to save:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# use the names argument to only save births and deaths arrays\n", - "session.save('population.h5', names=['births', 'deaths'])\n", - "\n", - "# load session saved in 'population.h5' to see its content\n", - "Session('population.h5')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "2) By default, dumping a session to an Excel or HDF5 file will overwrite it. By setting the ``overwrite`` argument to False, you can choose to update the existing Excel or HDF5 file: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pop = read_csv('./population/pop.csv')\n", - "ses_pop = Session([('pop', pop)])\n", - "\n", - "# by setting overwrite to False, the destination file is updated instead of overwritten.\n", - "# The items already stored in the file but not present in the session are left intact. 
\n", - "# On the contrary, the items that exist in both the file and the session are completely overwritten.\n", - "ses_pop.save('population.h5', overwrite=False)\n", - "\n", - "# load session saved in 'population.h5' to see its content\n", - "Session('population.h5')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "3) Setting the ``display`` argument to True, the ``save`` method will print a message each time an item is dumped: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# with display=True, the save method will print a message\n", - "# each time an item is dumped\n", - "session.save('population.h5', display=True)" - ] - } - ], - "metadata": { - "celltoolbar": "Edit Metadata", - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - }, - "livereveal": { - "autolaunch": false, - "scroll": true - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load And Dump Arrays, Sessions, Axes And Groups\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "LArray provides methods and functions to load and dump LArray, Session, Axis and Group objects to several formats such as Excel, CSV and HDF5. The HDF5 file format is designed to store and organize large amounts of data. It allows reading and writing data much faster than when working with CSV and Excel files. 
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# run this cell to avoid annoying warnings\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\", message=r'.*numpy.dtype size changed*')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# first of all, import the LArray library\n", + "from larray import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check the version of LArray:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from larray import __version__\n", + "__version__" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading and Dumping Arrays\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loading Arrays - Basic Usage (CSV, Excel, HDF5)\n", + "\n", + "To read an array from a CSV file, you must use the ``read_csv`` function:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "csv_dir = get_example_filepath('examples')\n", + "\n", + "# read the array pop from the file 'pop.csv'.\n", + "# The data of the array below is derived from a subset of the demo_pjan table from Eurostat\n", + "pop = read_csv(csv_dir + '/pop.csv')\n", + "pop" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To read an array from a sheet of an Excel file, you can use the ``read_excel`` function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "filepath_excel = get_example_filepath('examples.xlsx')\n", + "\n", + "# read the array from the sheet 'births' of the Excel file 'examples.xlsx'\n", + "# The data of the array below is derived from a subset of the demo_fasec table from Eurostat\n", + "births = 
read_excel(filepath_excel, 'births')\n", + "births" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The ``open_excel`` function in combination with the ``load`` method allows you to load several arrays from the same Workbook without opening and closing it several times:\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "# open the Excel file 'population.xlsx' and let it opened as long as you keep the indent.\n", + "# The Python keyword ``with`` ensures that the Excel file is properly closed even if an error occurs\n", + "with open_excel(filepath_excel) as wb:\n", + " # load the array 'pop' from the sheet 'pop' \n", + " pop = wb['pop'].load()\n", + " # load the array 'births' from the sheet 'births'\n", + " births = wb['births'].load()\n", + " # load the array 'deaths' from the sheet 'deaths'\n", + " deaths = wb['deaths'].load()\n", + "\n", + "# the Workbook is automatically closed when getting out the block defined by the with statement\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + " **Warning:** `open_excel` only works on Windows and requires the ``xlwings`` library to be installed.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `HDF5` file format is specifically designed to store and organize large amounts of data. \n", + "Reading and writing data in this file format is much faster than with CSV or Excel. \n", + "An HDF5 file can contain multiple arrays, each array being associated with a key.\n", + "To read an array from an HDF5 file, you must use the ``read_hdf`` function and provide the key associated with the array:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "filepath_hdf = get_example_filepath('examples.h5')\n", + "\n", + "# read the array from the file 'examples.h5' associated with the key 'deaths'\n", + "# The data of the array below is derived from a subset of the demo_magec table from Eurostat\n", + "deaths = read_hdf(filepath_hdf, 'deaths')\n", + "deaths" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dumping Arrays - Basic Usage (CSV, Excel, HDF5)\n", + "\n", + "To write an array in a CSV file, you must use the ``to_csv`` method:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# save the array pop in the file 'pop.csv'\n", + "pop.to_csv('pop.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To write an array to a sheet of an Excel file, you can use the ``to_excel`` method:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# save the array pop in the sheet 'pop' of the Excel file 'population.xlsx' \n", + "pop.to_excel('population.xlsx', 'pop')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that ``to_excel`` creates a new Excel file if it does not exist yet. 
\n", + "If the file already exists, a new sheet is added after the existing ones if that sheet does not already exist:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add a new sheet 'births' to the file 'population.xlsx' and save the array births in it\n", + "births.to_excel('population.xlsx', 'births')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To reset an Excel file, you simply need to set the `overwrite_file` argument to True:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 1. reset the file 'population.xlsx' (all sheets are removed)\n", + "# 2. create a sheet 'pop' and save the array pop in it\n", + "pop.to_excel('population.xlsx', 'pop', overwrite_file=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The ``open_excel`` function in combination with the ``dump()`` method allows you to open a Workbook and to export several arrays at once. If the Excel file doesn't exist, the ``overwrite_file`` argument must be set to True.\n", + "\n", + "
\n", + " **Warning:** The ``save`` method must be called at the end of the block defined by the *with* statement to actually write data in the Excel file, otherwise you will end up with an empty file.\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "# to create a new Excel file, argument overwrite_file must be set to True\n", + "with open_excel('population.xlsx', overwrite_file=True) as wb:\n", + " # add a new sheet 'pop' and dump the array pop in it \n", + " wb['pop'] = pop.dump()\n", + " # add a new sheet 'births' and dump the array births in it \n", + " wb['births'] = births.dump()\n", + " # add a new sheet 'deaths' and dump the array deaths in it \n", + " wb['deaths'] = deaths.dump()\n", + " # actually write data in the Workbook\n", + " wb.save()\n", + " \n", + "# the Workbook is automatically closed when getting out the block defined by the with statement\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To write an array in an HDF5 file, you must use the ``to_hdf`` function and provide the key that will be associated with the array:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# save the array pop in the file 'population.h5' and associate it with the key 'pop'\n", + "pop.to_hdf('population.h5', 'pop')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Specifying Wide VS Narrow format (CSV, Excel)\n", + "\n", + "By default, all reading functions assume that arrays are stored in the ``wide`` format, meaning that their last axis is represented horizontally:\n", + "\n", + "| country \\\\ time | 2013 | 2014 | 2015 |\n", + "| --------------- | -------- | -------- | -------- |\n", + "| Belgium | 11137974 | 11180840 | 11237274 |\n", + "| France | 65600350 | 65942267 | 66456279 |\n", + "\n", + "By setting the ``wide`` argument to False, reading functions will assume instead that arrays are stored in the ``narrow`` format, i.e. 
one column per axis plus one value column:\n", + "\n", + "| country | time | value |\n", + "| ------- | ---- | -------- |\n", + "| Belgium | 2013 | 11137974 |\n", + "| Belgium | 2014 | 11180840 |\n", + "| Belgium | 2015 | 11237274 |\n", + "| France | 2013 | 65600350 |\n", + "| France | 2014 | 65942267 |\n", + "| France | 2015 | 66456279 |\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# set 'wide' argument to False to indicate that the array is stored in the 'narrow' format\n", + "pop_BE_FR = read_csv(csv_dir + '/pop_narrow_format.csv', wide=False)\n", + "pop_BE_FR" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# same for the read_excel function\n", + "pop_BE_FR = read_excel(filepath_excel, sheet='pop_narrow_format', wide=False)\n", + "pop_BE_FR" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, writing functions will set the name of the column containing the data to 'value'. You can choose the name of this column by using the ``value_name`` argument. 
For example, using ``value_name='population'`` you can export the previous array as:\n", + "\n", + "| country | time | population |\n", + "| ------- | ---- | ---------- |\n", + "| Belgium | 2013 | 11137974 |\n", + "| Belgium | 2014 | 11180840 |\n", + "| Belgium | 2015 | 11237274 |\n", + "| France | 2013 | 65600350 |\n", + "| France | 2014 | 65942267 |\n", + "| France | 2015 | 66456279 |\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# dump the array pop_BE_FR in a narrow format (one column per axis plus one value column).\n", + "# By default, the name of the column containing data is set to 'value'\n", + "pop_BE_FR.to_csv('pop_narrow_format.csv', wide=False)\n", + "\n", + "# same but replace 'value' by 'population'\n", + "pop_BE_FR.to_csv('pop_narrow_format.csv', wide=False, value_name='population')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# same for the to_excel method\n", + "pop_BE_FR.to_excel('population.xlsx', 'pop_narrow_format', wide=False, value_name='population')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Like with the ``to_excel`` method, it is possible to export arrays in a ``narrow`` format using ``open_excel``. 
\n", + "To do so, you must set the ``wide`` argument of the ``dump`` method to False:\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "with open_excel('population.xlsx') as wb:\n", + " # dump the array pop_BE_FR in a narrow format: \n", + " # one column per axis plus one value column.\n", + " # Argument value_name can be used to change the name of the \n", + " # column containing the data (default name is 'value')\n", + " wb['pop_narrow_format'] = pop_BE_FR.dump(wide=False, value_name='population')\n", + " # don't forget to call save()\n", + " wb.save()\n", + "\n", + "# in the sheet 'pop_narrow_format', data is written as:\n", + "# | country | time | population |\n", + "# | ------- | ---- | ---------- |\n", + "# | Belgium | 2013 | 11137974 |\n", + "# | Belgium | 2014 | 11180840 |\n", + "# | Belgium | 2015 | 11237274 |\n", + "# | France | 2013 | 65600350 |\n", + "# | France | 2014 | 65942267 |\n", + "# | France | 2015 | 66456279 |\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Specifying Position in Sheet (Excel)\n", + "\n", + "If you want to read an array from an Excel sheet which does not start at cell `A1` (when there is more than one array stored in the same sheet for example), you will need to use the ``range`` argument. \n", + "\n", + "
\n", + " **Warning:** Note that the ``range`` argument is only available if you have the library ``xlwings`` installed (Windows).\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "# the 'range' argument must be used to load data not starting at cell A1.\n", + "# This is useful when there are several arrays stored in the same sheet\n", + "births = read_excel(filepath_excel, sheet='pop_births_deaths', range='A9:E15')\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using ``open_excel``, ranges are passed in brackets:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "with open_excel(filepath_excel) as wb:\n", + " # store sheet 'pop_births_deaths' in a temporary variable sh\n", + " sh = wb['pop_births_deaths']\n", + " # load the array pop from range A1:E7\n", + " pop = sh['A1:E7'].load()\n", + " # load the array births from range A9:E15\n", + " births = sh['A9:E15'].load()\n", + " # load the array deaths from range A17:E23\n", + " deaths = sh['A17:E23'].load()\n", + "\n", + "# the Workbook is automatically closed when getting out the block defined by the with statement\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When exporting arrays to Excel files, data is written starting at cell `A1` by default. Using the ``position`` argument of the ``to_excel`` method, it is possible to specify the top left cell of the dumped data. This can be useful when you want to export several arrays in the same sheet, for example.\n", + "\n", + "
\n", + " **Warning:** Note that the ``position`` argument is only available if you have the library ``xlwings`` installed (Windows).\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "filename = 'population.xlsx'\n", + "sheetname = 'pop_births_deaths'\n", + "\n", + "# save the arrays pop, births and deaths in the same sheet 'pop_births_deaths'.\n", + "# The 'position' argument is used to shift the location of the second and third arrays to be dumped\n", + "pop.to_excel(filename, sheetname)\n", + "births.to_excel(filename, sheetname, position='A9')\n", + "deaths.to_excel(filename, sheetname, position='A17')\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using ``open_excel``, the position is passed in brackets (this allows you to also add extra information): \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "with open_excel('population.xlsx') as wb:\n", + " # add a new sheet 'pop_births_deaths' and write 'population' in the first cell\n", + " # note: you can use wb['new_sheet_name'] = '' to create an empty sheet\n", + " wb['pop_births_deaths'] = 'population'\n", + " # store sheet 'pop_births_deaths' in a temporary variable sh\n", + " sh = wb['pop_births_deaths']\n", + " # dump the array pop in sheet 'pop_births_deaths' starting at cell A2\n", + " sh['A2'] = pop.dump()\n", + " # add 'births' in cell A10\n", + " sh['A10'] = 'births'\n", + " # dump the array births in sheet 'pop_births_deaths' starting at cell A11 \n", + " sh['A11'] = births.dump()\n", + " # add 'deaths' in cell A19\n", + " sh['A19'] = 'deaths'\n", + " # dump the array deaths in sheet 'pop_births_deaths' starting at cell A20\n", + " sh['A20'] = deaths.dump()\n", + " # don't forget to call save()\n", + " wb.save()\n", + " \n", + "# the Workbook is automatically closed when getting out the block defined by the with statement\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exporting data without headers (Excel)\n", + "\n", + "For some reason, you may want to 
export only the data of an array without axes. For example, you may want to insert a new column containing extra information. As an exercise, let us say we want to add the capital city for each country present in the array containing the total population by country:\n", + "\n", + "| country | capital city | 2013 | 2014 | 2015 |\n", + "| ------- | ------------ | -------- | -------- | -------- |\n", + "| Belgium | Brussels | 11137974 | 11180840 | 11237274 |\n", + "| France | Paris | 65600350 | 65942267 | 66456279 |\n", + "| Germany | Berlin | 80523746 | 80767463 | 81197537 |\n", + "\n", + "Assuming you have prepared an Excel sheet as below: \n", + "\n", + "| country | capital city | 2013 | 2014 | 2015 |\n", + "| ------- | ------------ | -------- | -------- | -------- |\n", + "| Belgium | Brussels | | | |\n", + "| France | Paris | | | |\n", + "| Germany | Berlin | | | |\n", + "\n", + "you can then dump the data at the right place by setting the ``header`` argument of ``to_excel`` to False and specifying the position of the data in the sheet:\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "pop_by_country = pop.sum('gender')\n", + "\n", + "# export only the data of the array pop_by_country starting at cell C2\n", + "pop_by_country.to_excel('population.xlsx', 'pop_by_country', header=False, position='C2')\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using ``open_excel``, you can easily prepare the sheet and then export only data at the right place by either setting the ``header`` argument of the ``dump`` method to False or not calling ``dump`` at all:\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "with open_excel('population.xlsx') as wb:\n", + " # create new empty sheet 'pop_by_country'\n", + " wb['pop_by_country'] = ''\n", + " # store sheet 'pop_by_country' in a temporary variable sh\n", + " sh = wb['pop_by_country']\n", + " # write extra 
information (description)\n", + " sh['A1'] = 'Population at 1st January by country'\n", + " # export column names\n", + " sh['A2'] = ['country', 'capital city']\n", + " sh['C2'] = pop_by_country.time.labels\n", + " # export countries as first column\n", + " sh['A3'].options(transpose=True).value = pop_by_country.country.labels\n", + " # export capital cities as second column\n", + " sh['B3'].options(transpose=True).value = ['Brussels', 'Paris', 'Berlin']\n", + " # export only data of pop_by_country\n", + " sh['C3'] = pop_by_country.dump(header=False)\n", + " # or equivalently\n", + " sh['C3'] = pop_by_country\n", + " # don't forget to call save()\n", + " wb.save()\n", + " \n", + "# the Workbook is automatically closed when getting out the block defined by the with statement\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Specifying the Number of Axes at Reading (CSV, Excel)\n", + "\n", + "By default, ``read_csv`` and ``read_excel`` will search the position of the first cell containing the special character ``\\`` in the header line in order to determine the number of axes of the array to read. The special character ``\\`` is used to separate the name of the two last axes. If there is no special character ``\\``, ``read_csv`` and ``read_excel`` will consider that the array to read has only one dimension. 
For an array stored as:\n", + "\n", + "| country | gender \\\\ time | 2013 | 2014 | 2015 |\n", + "| ------- | -------------- | -------- | -------- | -------- |\n", + "| Belgium | Male | 5472856 | 5493792 | 5524068 |\n", + "| Belgium | Female | 5665118 | 5687048 | 5713206 |\n", + "| France | Male | 31772665 | 31936596 | 32175328 |\n", + "| France | Female | 33827685 | 34005671 | 34280951 |\n", + "| Germany | Male | 39380976 | 39556923 | 39835457 |\n", + "| Germany | Female | 41142770 | 41210540 | 41362080 |\n", + "\n", + "``read_csv`` and ``read_excel`` will find the special character ``\\`` in the second cell meaning it expects three axes (country, gender and time). \n", + "\n", + "Sometimes, you need to read an array for which the name of the last axis is implicit: \n", + "\n", + "| country | gender | 2013 | 2014 | 2015 |\n", + "| ------- | ------ | -------- | -------- | -------- |\n", + "| Belgium | Male | 5472856 | 5493792 | 5524068 |\n", + "| Belgium | Female | 5665118 | 5687048 | 5713206 |\n", + "| France | Male | 31772665 | 31936596 | 32175328 |\n", + "| France | Female | 33827685 | 34005671 | 34280951 |\n", + "| Germany | Male | 39380976 | 39556923 | 39835457 |\n", + "| Germany | Female | 41142770 | 41210540 | 41362080 |\n", + "\n", + "For such case, you will have to inform ``read_csv`` and ``read_excel`` of the number of axes of the output array by setting the ``nb_axes`` argument:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# read the 3 x 2 x 3 array stored in the file 'pop_missing_axis_name.csv' wihout using 'nb_axes' argument.\n", + "pop = read_csv(csv_dir + '/pop_missing_axis_name.csv')\n", + "# shape and data type of the output array are not what we expected\n", + "pop.info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# by setting the 'nb_axes' argument, you can indicate to read_csv the number of axes of the 
output array\n", + "pop = read_csv(csv_dir + '/pop_missing_axis_name.csv', nb_axes=3)\n", + "\n", + "# give a name to the last axis\n", + "pop = pop.rename(-1, 'time')\n", + "\n", + "# shape and data type of the output array are what we expected\n", + "pop.info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# same for the read_excel function\n", + "pop = read_excel(filepath_excel, sheet='pop_missing_axis_name', nb_axes=3)\n", + "pop = pop.rename(-1, 'time')\n", + "pop.info" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### NaNs and Missing Data Handling at Reading (CSV, Excel)\n", + "\n", + "Sometimes, there is no data available for some label combinations. In the example below, the rows corresponding to `France - Male` and `Germany - Female` are missing:\n", + "\n", + "| country | gender \\\\ time | 2013 | 2014 | 2015 |\n", + "| ------- | -------------- | -------- | -------- | -------- |\n", + "| Belgium | Male | 5472856 | 5493792 | 5524068 |\n", + "| Belgium | Female | 5665118 | 5687048 | 5713206 |\n", + "| France | Female | 33827685 | 34005671 | 34280951 |\n", + "| Germany | Male | 39380976 | 39556923 | 39835457 |\n", + "\n", + "By default, ``read_csv`` and ``read_excel`` will fill cells associated with missing label combinations with nans. \n", + "Be aware that, in that case, an int array will be converted to a float array." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# by default, cells associated with missing label combinations are filled with nans.\n", + "# In that case, the output array is converted to a float array\n", + "read_csv(csv_dir + '/pop_missing_values.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, it is possible to choose which value to use to fill missing cells using the ``fill_value`` argument:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "read_csv(csv_dir + '/pop_missing_values.csv', fill_value=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# same for the read_excel function\n", + "read_excel(filepath_excel, sheet='pop_missing_values', fill_value=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sorting Axes at Reading (CSV, Excel, HDF5)\n", + "\n", + "The ``sort_rows`` and ``sort_columns`` arguments of the reading functions allow you to sort rows and columns alphabetically:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# sort labels at reading --> Male and Female labels are inverted\n", + "read_csv(csv_dir + '/pop.csv', sort_rows=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "read_excel(filepath_excel, sheet='births', sort_rows=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "read_hdf(filepath_hdf, key='deaths').sort_axes()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Metadata (HDF5)\n", + "\n", + "Since version 0.29 of LArray, it is possible to add metadata to arrays:" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "pop.meta.title = 'Population at 1st January'\n", + "pop.meta.origin = 'Table demo_jpan from Eurostat'\n", + "\n", + "pop.info" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These metadata are automatically saved and loaded when working with the HDF5 file format: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pop.to_hdf('population.h5', 'pop')\n", + "\n", + "new_pop = read_hdf('population.h5', 'pop')\n", + "new_pop.info" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + " **Warning:** Currently, metadata associated with arrays cannot be saved and loaded when working with CSV and Excel files.\n", + " This restriction does not apply however to metadata associated with sessions.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading and Dumping Sessions\n", + "\n", + "One of the main advantages of grouping arrays, axes and groups in session objects is that you can load and save all of them in one shot. Like arrays, it is possible to associate metadata to a session. These can be saved and loaded in all file formats. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loading Sessions (CSV, Excel, HDF5)\n", + "\n", + "To load the items of a session, you have two options:\n", + "\n", + "1) Instantiate a new session and pass the path to the Excel/HDF5 file or to the directory containing CSV files to the Session constructor:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a new Session object and load all arrays, axes, groups and metadata \n", + "# from all CSV files located in the passed directory\n", + "csv_dir = get_example_filepath('demography_eurostat')\n", + "session = Session(csv_dir)\n", + "\n", + "# create a new Session object and load all arrays, axes, groups and metadata\n", + "# stored in the passed Excel file\n", + "filepath_excel = get_example_filepath('demography_eurostat.xlsx')\n", + "session = Session(filepath_excel)\n", + "\n", + "# create a new Session object and load all arrays, axes, groups and metadata\n", + "# stored in the passed HDF5 file\n", + "filepath_hdf = get_example_filepath('demography_eurostat.h5')\n", + "session = Session(filepath_hdf)\n", + "\n", + "print(session.summary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2) Call the ``load`` method on an existing session and pass the path to the Excel/HDF5 file or to the directory containing CSV files as first argument:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a session containing 3 axes, 2 groups and one array 
'pop'\n", + "filepath = get_example_filepath('pop_only.xlsx')\n", + "session = Session(filepath)\n", + "\n", + "print(session.summary())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# call the load method on the previous session and add the 'births' and 'deaths' arrays to it\n", + "filepath = get_example_filepath('births_and_deaths.xlsx')\n", + "session.load(filepath)\n", + "\n", + "print(session.summary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The ``load`` method offers some options:\n", + "\n", + "1) Using the ``names`` argument, you can specify which items to load:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "session = Session()\n", + "\n", + "# use the names argument to only load births and deaths arrays\n", + "session.load(filepath_hdf, names=['births', 'deaths'])\n", + "\n", + "print(session.summary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2) Setting the ``display`` argument to True, the ``load`` method will print a message each time a new item is loaded: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "session = Session()\n", + "\n", + "# with display=True, the load method will print a message\n", + "# each time a new item is loaded\n", + "session.load(filepath_hdf, display=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dumping Sessions (CSV, Excel, HDF5)\n", + "\n", + "To save a session, you need to call the ``save`` method. 
The first argument is the path to a Excel/HDF5 file or to a directory if items are saved to CSV files:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# save items of a session in CSV files.\n", + "# Here, the save method will create a 'population' directory in which CSV files will be written \n", + "session.save('population')\n", + "\n", + "# save session to an HDF5 file\n", + "session.save('population.h5')\n", + "\n", + "# save session to an Excel file\n", + "session.save('population.xlsx')\n", + "\n", + "# load session saved in 'population.h5' to see its content\n", + "Session('population.h5')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + " Note: Concerning the CSV and Excel formats: \n", + " \n", + " - all Axis objects are saved together in the same Excel sheet (CSV file) named `__axes__(.csv)` \n", + " - all Group objects are saved together in the same Excel sheet (CSV file) named `__groups__(.csv)` \n", + " - metadata is saved in one Excel sheet (CSV file) named `__metadata__(.csv)` \n", + " \n", + " These sheet (CSV file) names cannot be changed. \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The ``save`` method has several arguments:\n", + "\n", + "1) Using the ``names`` argument, you can specify which items to save:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# use the names argument to only save births and deaths arrays\n", + "session.save('population.h5', names=['births', 'deaths'])\n", + "\n", + "# load session saved in 'population.h5' to see its content\n", + "Session('population.h5')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2) By default, dumping a session to an Excel or HDF5 file will overwrite it. By setting the ``overwrite`` argument to False, you can choose to update the existing Excel or HDF5 file: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pop = read_csv('./population/pop.csv')\n", + "ses_pop = Session([('pop', pop)])\n", + "\n", + "# by setting overwrite to False, the destination file is updated instead of overwritten.\n", + "# The items already stored in the file but not present in the session are left intact. 
\n", + "# On the contrary, the items that exist in both the file and the session are completely overwritten.\n", + "ses_pop.save('population.h5', overwrite=False)\n", + "\n", + "# load session saved in 'population.h5' to see its content\n", + "Session('population.h5')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3) Setting the ``display`` argument to True, the ``save`` method will print a message each time an item is dumped: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# with display=True, the save method will print a message\n", + "# each time an item is dumped\n", + "session.save('population.h5', display=True)" + ] + } + ], + "metadata": { + "celltoolbar": "Edit Metadata", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "livereveal": { + "autolaunch": false, + "scroll": true + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/doc/source/tutorial/tutorial_arithmetic_op_and_aggregation.ipyml b/doc/source/tutorial/tutorial_arithmetic_op_and_aggregation.ipyml index 3961c6cd6..2b1e2ef07 100644 --- a/doc/source/tutorial/tutorial_arithmetic_op_and_aggregation.ipyml +++ b/doc/source/tutorial/tutorial_arithmetic_op_and_aggregation.ipyml @@ -9,7 +9,6 @@ cells: import warnings warnings.filterwarnings("ignore", message=r'.*numpy.dtype size changed*') - id: 0 metadata: nbsphinx: hidden @@ -20,7 +19,6 @@ cells: - code: | from larray import * - id: 1 - markdown: | Check the version of LArray: @@ -30,22 +28,15 @@ cells: from larray import __version__ __version__ - id: 2 - markdown: | ## Arithmetic operations -- markdown: | - Import a subset of the test array 
``pop``: - - - code: | - # import a 6 x 2 x 2 subset of the 'pop' example array - pop = load_example_data('demography').pop[2016, 'BruCap', 90:95] - pop + arr = ndtest((3, 3)) + arr - id: 3 - markdown: | One can do all usual arithmetic operations on an array, it will apply the operation to all elements individually @@ -53,237 +44,304 @@ cells: - code: | # addition - pop + 200 + arr + 10 - id: 4 - code: | # multiplication - pop * 2 + arr * 2 - id: 5 - code: | - # ** means raising to the power (squaring in this case) - pop ** 2 + # 'true' division + arr / 2 + + +- code: | + # 'floor' division + arr // 2 + + +- markdown: | +
+ **Warning:** Python has two different division operators: + + - the 'true' division (/) always returns a float. + - the 'floor' division (//) returns an integer result (discarding any fractional result). +
- id: 6 - code: | # % means modulo (aka remainder of division) - pop % 10 + arr % 5 + + +- code: | + # ** means raising to the power + arr ** 3 - id: 7 - markdown: | - More interestingly, it also works between two arrays + More interestingly, binary operators as above also works between two arrays: - code: | - # load mortality equivalent array - mortality = load_example_data('demography').qx[2016, 'BruCap', 90:95] + # load the 'demography_eurostat' dataset + demo_eurostat = load_example_data('demography_eurostat') - # compute number of deaths - death = pop * mortality - death + # extract the 'pop' array + pop = demo_eurostat.pop + pop + + +- code: | + aggregation_matrix = LArray([[1, 0, 0], [0, 1, 1]], axes=(Axis('country=Belgium,France+Germany'), pop.country)) + aggregation_matrix + + +- code: | + # @ means matrix product + aggregation_matrix @ pop['Male'] - id: 8 - markdown: |
**Note:** Be careful when mixing different data types. - You can use the method ``astype`` to change the data type of an array. + You can use the method [astype](../_generated/larray.LArray.astype.rst#larray.LArray.astype) to change the data type of an array.
- code: | - # to be sure to get number of deaths as integers - # one can use .astype() method - death = (pop * mortality).astype(int) - death + aggregation_matrix = LArray([[1, 0, 0], [0, 0.5, 0.5]], axes=(Axis('country=Belgium,France+Germany/2'), pop.country)) + aggregation_matrix + + +- code: | + aggregation_matrix @ pop['Male'] + + +- code: | + # force the resulting matrix to be an integer matrix + (aggregation_matrix @ pop['Male']).astype(int) - id: 9 - markdown: | -
- **Warning:** Operations between two arrays only works when they have compatible axes (i.e. same labels). - However, it can be override but at your own risk. - In that case only the position on the axis is used and not the labels. -
+ ### Axis order does not matter much (except for output) + + You can do operations between arrays having different axes order. + The axis order of the result is the same as the left array - code: | - pop[90:92] * mortality[93:95] + # extract the 'births' array + births = demo_eurostat.births + + # let's change the order of axes of the 'births' array + births_transposed = births.transpose() + births_transposed - id: 10 - code: | - pop[90:92] * mortality[93:95].ignore_labels('age') + # LArray doesn't care of axes order when performing + # arithmetic operations between arrays + pop + births_transposed - id: 11 - markdown: | - ### Boolean Operations + ### Axes must be compatible + + Arithmetic operations between two arrays only works when they have compatible axes (i.e. same labels). - code: | - pop2 = pop.copy() - pop2['F'] = -pop2['F'] - pop2 + # the 'pop' and 'births' have compatible axes + pop + births - id: 12 - code: | - # testing for equality is done using == (a single = assigns the value) - pop == pop2 + # Now, let's replace the country names by the country codes + births_codes = births.set_labels('country', ['BE', 'FR', 'DE']) + births_codes - id: 13 - code: | - # testing for inequality - pop != pop2 + # arithmetic operations between arrays + # having incompatible axes raise an error + try: + pop + births_codes + except Exception as e: + print(type(e).__name__, e) - id: 14 -- code: | - # what was our original array like again? - pop +- markdown: | +
+ **Warning:** Operations between two arrays only work when they have compatible axes (i.e. same labels) but this behavior can be overridden via the [ignore_labels](../_generated/larray.LArray.ignore_labels.rst#larray.LArray.ignore_labels) method. + In that case only the position on the axis is used and not the labels. + Using this method is done at your own risk. + 
- id: 15 - code: | - # & means (boolean array) and - (pop >= 500) & (pop <= 1000) + # use the .ignore_labels() method on axis 'country' + # to avoid the incompatible axes error (risky) + pop + births_codes.ignore_labels('country') - id: 16 + metadata: + scrolled: true -- code: | - # | means (boolean array) or - (pop < 500) | (pop > 1000) +- markdown: | + ### Extra Or Missing Axes (Broadcasting) - id: 17 - markdown: | - ### Arithmetic operations with missing axes + The condition that axes must be compatible only applies on common axes. + Arithmetic operations between two arrays can be performed even if the second array has extra or missing axes compared to the first one: - code: | - pop.sum('age') + # let's define a 'multiplicator' vector with + # one value defined for each gender + multiplicator = LArray([-1, 1], axes=pop.gender) + multiplicator - id: 18 - code: | - # arr has 3 dimensions - pop.info + # the multiplication below has been propagated to the + # 'country' and 'time' axes. + # This behavior is called broadcasting + pop * multiplicator - id: 19 -- code: | - # and arr.sum(age) has two - pop.sum('age').info +- markdown: | + ### Boolean Operations + + Python comparison operators are: + + | Operator | Meaning | + |-----------|-------------------------| + |``==`` | equal | + |``!=`` | not equal | + |``>`` | greater than | + |``>=`` | greater than or equal | + |``<`` | less than | + |``<=`` | less than or equal | + + Applying a comparison operator on an array returns a boolean array: - id: 20 - code: | - # you can do operation with missing axes so this works - pop / pop.sum('age') + # test which values are greater than 10 millions + pop > 10e6 - id: 21 - markdown: | - ### Axis order does not matter much (except for output) + Comparison operations can be combined using Python bitwise operators: - You can do operations between arrays having different axes order. 
- The axis order of the result is the same as the left array + | Operator | Meaning | + |----------|------------------------------------- | + | & | and | + | \| | or | + | ~ | not | - code: | - pop + # test which values are greater than 10 millions and less than 40 millions + (pop > 10e6) & (pop < 40e6) - id: 22 - code: | - # let us change the order of axes - pop_transposed = pop.T - pop_transposed + # test which values are less than 10 millions or greater than 40 millions + (pop < 10e6) | (pop > 40e6) - id: 23 - code: | - # mind blowing - pop_transposed + pop + # test which values are not less than 10 millions + ~(pop < 10e6) - id: 24 - markdown: | - ## Aggregates - - Calculate the sum along an axis: + The returned boolean array can then be used in selections and assignments: - code: | - pop = load_example_data('demography').pop[2016, 'BruCap'] - pop.sum('age') + pop_copy = pop.copy() + + # set all values greater than 40 millions to 40 millions + pop_copy[pop_copy > 40e6] = 40e6 + pop_copy - id: 25 - markdown: | - or along all axes except one by appending `_by` to the aggregation function + Boolean operations can be made between arrays: - code: | - pop[90:95].sum_by('age') - # is equivalent to - pop[90:95].sum('sex', 'nat') + # test where the two arrays have the same values + pop == pop_copy - id: 26 - markdown: | - Calculate the sum along one group: + To test if all values between are equals, use the [equals](../_generated/larray.LArray.equals.rst#larray.LArray.equals) method: - code: | - teens = pop.age[10:20] + pop.equals(pop_copy) + + +- markdown: | + ## Aggregates - pop.sum(teens) + LArray provides many aggregation functions. The list is given in the [Aggregation Functions](../api.rst#aggregation-functions) subsection of the [API Reference](../api.rst) page. + + Aggregation operations can be performed on axes or groups. Axes and groups can be mixed. 
+ + The main rules are: + + - Axes are separated by commas ``,`` + - Groups belonging to the same axis are grouped inside parentheses () - id: 27 - markdown: | - Calculate the sum along two groups: + Calculate the sum along an axis: - code: | - pensioners = pop.age[67:] - - # groups from the same axis must be grouped in a tuple - pop.sum((teens, pensioners)) + pop.sum('gender') - id: 28 - markdown: | - Mixing axes and groups in aggregations: + or several axes (axes are separated by commas ``,``): - code: | - pop.sum((teens, pensioners), 'nat') + pop.sum('country', 'gender') - id: 29 - markdown: | - ### More On Aggregations + Calculate the sum along all axes except one by appending `_by` to the aggregation function: + + +- code: | + pop.sum_by('time') - markdown: | - There are many other aggregation functions: + Calculate the sum along groups (the groups belonging to the same axis must grouped inside parentheses ()): + + +- code: | + even_years = pop.time[2014::2] >> 'even_years' + odd_years = pop.time[2013::2] >> 'odd_years' - - mean, min, max, median, percentile, var (variance), std (standard - deviation) - - labelofmin, labelofmax (label indirect minimum/maxium -- labels where the - value is minimum/maximum) - - indexofmin, indexofmax (positional indirect minimum/maxium -- position - along axis where the value is minimum/maximum) - - cumsum, cumprod (cumulative sum, cumulative product) + pop.sum((odd_years, even_years)) + + +- markdown: | + Mixing axes and groups in aggregations: + + +- code: | + pop.sum('gender', (odd_years, even_years)) # The lines below here may be deleted if you do not need them. 
@@ -303,25 +361,10 @@ metadata: name: python nbconvert_exporter: python pygments_lexer: ipython3 - version: 3.6.4 + version: 3.7.3 livereveal: autolaunch: false scroll: true nbformat: 4 nbformat_minor: 2 -# --------------------------------------------------------------------------- -data: - [{execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}] - diff --git a/doc/source/tutorial/tutorial_arithmetic_op_and_aggregation.ipynb b/doc/source/tutorial/tutorial_arithmetic_op_and_aggregation.ipynb index 8e9fead86..b6ea22d88 100644 --- a/doc/source/tutorial/tutorial_arithmetic_op_and_aggregation.ipynb +++ b/doc/source/tutorial/tutorial_arithmetic_op_and_aggregation.ipynb @@ -60,11 +60,21 @@ "## Arithmetic operations\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "arr = ndtest((3, 3))\n", + "arr" + ] + }, { "cell_type": 
"markdown", "metadata": {}, "source": [ - "Import a subset of the test array ``pop``:\n" + "One can do all usual arithmetic operations on an array, it will apply the operation to all elements individually\n" ] }, { @@ -73,16 +83,18 @@ "metadata": {}, "outputs": [], "source": [ - "# import a 6 x 2 x 2 subset of the 'pop' example array\n", - "pop = load_example_data('demography').pop[2016, 'BruCap', 90:95]\n", - "pop" + "# addition\n", + "arr + 10" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "One can do all usual arithmetic operations on an array, it will apply the operation to all elements individually\n" + "# multiplication\n", + "arr * 2" ] }, { @@ -91,8 +103,8 @@ "metadata": {}, "outputs": [], "source": [ - "# addition\n", - "pop + 200" + "# 'true' division\n", + "arr / 2" ] }, { @@ -101,8 +113,20 @@ "metadata": {}, "outputs": [], "source": [ - "# multiplication\n", - "pop * 2" + "# 'floor' division\n", + "arr // 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "**Warning:** Python has two different division operators: \n", + "\n", + "- the 'true' division (/) always returns a float.\n", + "- the 'floor' division (//) returns an integer result (discarding any fractional result).\n", + "
" ] }, { @@ -111,8 +135,8 @@ "metadata": {}, "outputs": [], "source": [ - "# ** means raising to the power (squaring in this case)\n", - "pop ** 2" + "# % means modulo (aka remainder of division)\n", + "arr % 5" ] }, { @@ -121,15 +145,15 @@ "metadata": {}, "outputs": [], "source": [ - "# % means modulo (aka remainder of division)\n", - "pop % 10" + "# ** means raising to the power\n", + "arr ** 3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "More interestingly, it also works between two arrays\n" + "More interestingly, binary operators as above also works between two arrays:\n" ] }, { @@ -138,22 +162,22 @@ "metadata": {}, "outputs": [], "source": [ - "# load mortality equivalent array\n", - "mortality = load_example_data('demography').qx[2016, 'BruCap', 90:95]\n", + "# load the 'demography_eurostat' dataset\n", + "demo_eurostat = load_example_data('demography_eurostat')\n", "\n", - "# compute number of deaths\n", - "death = pop * mortality\n", - "death" + "# extract the 'pop' array\n", + "pop = demo_eurostat.pop\n", + "pop" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "
\n", - "**Note:** Be careful when mixing different data types.\n", - "You can use the method ``astype`` to change the data type of an array.\n", - "
\n" + "aggregation_matrix = LArray([[1, 0, 0], [0, 1, 1]], axes=(Axis('country=Belgium,France+Germany'), pop.country))\n", + "aggregation_matrix" ] }, { @@ -162,20 +186,17 @@ "metadata": {}, "outputs": [], "source": [ - "# to be sure to get number of deaths as integers\n", - "# one can use .astype() method\n", - "death = (pop * mortality).astype(int)\n", - "death" + "# @ means matrix product\n", + "aggregation_matrix @ pop['Male']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "
\n", - "**Warning:** Operations between two arrays only works when they have compatible axes (i.e. same labels).\n", - "However, it can be override but at your own risk.\n", - "In that case only the position on the axis is used and not the labels.\n", + "
\n", + "**Note:** Be careful when mixing different data types.\n", + "You can use the method [astype](../_generated/larray.LArray.astype.rst#larray.LArray.astype) to change the data type of an array.\n", "
\n" ] }, @@ -185,7 +206,8 @@ "metadata": {}, "outputs": [], "source": [ - "pop[90:92] * mortality[93:95]" + "aggregation_matrix = LArray([[1, 0, 0], [0, 0.5, 0.5]], axes=(Axis('country=Belgium,France+Germany/2'), pop.country))\n", + "aggregation_matrix" ] }, { @@ -194,14 +216,27 @@ "metadata": {}, "outputs": [], "source": [ - "pop[90:92] * mortality[93:95].ignore_labels('age')" + "aggregation_matrix @ pop['Male']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# force the resulting matrix to be an integer matrix\n", + "(aggregation_matrix @ pop['Male']).astype(int)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Boolean Operations\n" + "### Axis order does not matter much (except for output)\n", + "\n", + "You can do operations between arrays having different axes order.\n", + "The axis order of the result is the same as the left array\n" ] }, { @@ -210,9 +245,12 @@ "metadata": {}, "outputs": [], "source": [ - "pop2 = pop.copy()\n", - "pop2['F'] = -pop2['F']\n", - "pop2" + "# extract the 'births' array\n", + "births = demo_eurostat.births\n", + "\n", + "# let's change the order of axes of the 'births' array\n", + "births_transposed = births.transpose()\n", + "births_transposed" ] }, { @@ -221,18 +259,18 @@ "metadata": {}, "outputs": [], "source": [ - "# testing for equality is done using == (a single = assigns the value)\n", - "pop == pop2" + "# LArray doesn't care of axes order when performing \n", + "# arithmetic operations between arrays\n", + "pop + births_transposed" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# testing for inequality\n", - "pop != pop2" + "### Axes must be compatible\n", + "\n", + "Arithmetic operations between two arrays only works when they have compatible axes (i.e. same labels)." 
] }, { @@ -241,8 +279,8 @@ "metadata": {}, "outputs": [], "source": [ - "# what was our original array like again?\n", - "pop" + "# the 'pop' and 'births' have compatible axes\n", + "pop + births" ] }, { @@ -251,8 +289,9 @@ "metadata": {}, "outputs": [], "source": [ - "# & means (boolean array) and\n", - "(pop >= 500) & (pop <= 1000)" + "# Now, let's replace the country names by the country codes\n", + "births_codes = births.set_labels('country', ['BE', 'FR', 'DE'])\n", + "births_codes" ] }, { @@ -261,24 +300,51 @@ "metadata": {}, "outputs": [], "source": [ - "# | means (boolean array) or\n", - "(pop < 500) | (pop > 1000)" + "# arithmetic operations between arrays \n", + "# having incompatible axes raise an error\n", + "try:\n", + " pop + births_codes\n", + "except Exception as e:\n", + " print(type(e).__name__, e)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Arithmetic operations with missing axes\n" + "
\n", + " **Warning:** Operations between two arrays only work when they have compatible axes (i.e. same labels) but this behavior can be overridden via the [ignore_labels](../_generated/larray.LArray.ignore_labels.rst#larray.LArray.ignore_labels) method.\n", + "In that case only the position on the axis is used and not the labels.\n", + "Using this method is done at your own risk.\n", + "
\n" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "pop.sum('age')" + "# use the .ignore_labels() method on axis 'country'\n", + "# to avoid the incompatible axes error (risky)\n", + "pop + births_codes.ignore_labels('country')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Extra Or Missing Axes (Broadcasting)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The condition that axes must be compatible only applies on common axes. \n", + "Arithmetic operations between two arrays can be performed even if the second array has extra or missing axes compared to the first one:" ] }, { @@ -287,8 +353,10 @@ "metadata": {}, "outputs": [], "source": [ - "# arr has 3 dimensions\n", - "pop.info" + "# let's define a 'multiplicator' vector with \n", + "# one value defined for each gender\n", + "multiplicator = LArray([-1, 1], axes=pop.gender)\n", + "multiplicator" ] }, { @@ -297,8 +365,30 @@ "metadata": {}, "outputs": [], "source": [ - "# and arr.sum(age) has two\n", - "pop.sum('age').info" + "# the multiplication below has been propagated to the \n", + "# 'country' and 'time' axes.\n", + "# This behavior is called broadcasting\n", + "pop * multiplicator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Boolean Operations\n", + "\n", + "Python comparison operators are: \n", + "\n", + "| Operator | Meaning |\n", + "|-----------|-------------------------|\n", + "|``==`` | equal | \n", + "|``!=`` | not equal | \n", + "|``>`` | greater than | \n", + "|``>=`` | greater than or equal | \n", + "|``<`` | less than | \n", + "|``<=`` | less than or equal |\n", + "\n", + "Applying a comparison operator on an array returns a boolean array:" ] }, { @@ -307,18 +397,21 @@ "metadata": {}, "outputs": [], "source": [ - "# you can do operation with missing axes so this works\n", - "pop / pop.sum('age')" + "# test which 
values are greater than 10 millions\n", + "pop > 10e6" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Axis order does not matter much (except for output)\n", + "Comparison operations can be combined using Python bitwise operators:\n", "\n", - "You can do operations between arrays having different axes order.\n", - "The axis order of the result is the same as the left array\n" + "| Operator | Meaning |\n", + "|----------|------------------------------------- |\n", + "| & | and |\n", + "| \\| | or |\n", + "| ~ | not |" ] }, { @@ -327,7 +420,8 @@ "metadata": {}, "outputs": [], "source": [ - "pop" + "# test which values are greater than 10 millions and less than 40 millions\n", + "(pop > 10e6) & (pop < 40e6)" ] }, { @@ -336,9 +430,8 @@ "metadata": {}, "outputs": [], "source": [ - "# let us change the order of axes\n", - "pop_transposed = pop.T\n", - "pop_transposed" + "# test which values are less than 10 millions or greater than 40 millions\n", + "(pop < 10e6) | (pop > 40e6)" ] }, { @@ -347,17 +440,15 @@ "metadata": {}, "outputs": [], "source": [ - "# mind blowing\n", - "pop_transposed + pop" + "# test which values are not less than 10 millions\n", + "~(pop < 10e6)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Aggregates\n", - "\n", - "Calculate the sum along an axis:\n" + "The returned boolean array can then be used in selections and assignments:" ] }, { @@ -366,15 +457,18 @@ "metadata": {}, "outputs": [], "source": [ - "pop = load_example_data('demography').pop[2016, 'BruCap']\n", - "pop.sum('age')" + "pop_copy = pop.copy()\n", + "\n", + "# set all values greater than 40 millions to 40 millions\n", + "pop_copy[pop_copy > 40e6] = 40e6\n", + "pop_copy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "or along all axes except one by appending `_by` to the aggregation function\n" + "Boolean operations can be made between arrays:" ] }, { @@ -383,16 +477,15 @@ "metadata": {}, "outputs": [], "source": [ - 
"pop[90:95].sum_by('age')\n", - "# is equivalent to\n", - "pop[90:95].sum('sex', 'nat')" + "# test where the two arrays have the same values\n", + "pop == pop_copy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Calculate the sum along one group:\n" + "To test if all values between are equals, use the [equals](../_generated/larray.LArray.equals.rst#larray.LArray.equals) method:" ] }, { @@ -401,16 +494,30 @@ "metadata": {}, "outputs": [], "source": [ - "teens = pop.age[10:20]\n", + "pop.equals(pop_copy)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aggregates\n", + "\n", + "LArray provides many aggregation functions. The list is given in the [Aggregation Functions](../api.rst#aggregation-functions) subsection of the [API Reference](../api.rst) page.\n", + "\n", + "Aggregation operations can be performed on axes or groups. Axes and groups can be mixed. \n", "\n", - "pop.sum(teens)" + "The main rules are: \n", + "\n", + "- Axes are separated by commas ``,``\n", + "- Groups belonging to the same axis are grouped inside parentheses ()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Calculate the sum along two groups:\n" + "Calculate the sum along an axis:" ] }, { @@ -419,17 +526,14 @@ "metadata": {}, "outputs": [], "source": [ - "pensioners = pop.age[67:]\n", - "\n", - "# groups from the same axis must be grouped in a tuple\n", - "pop.sum((teens, pensioners))" + "pop.sum('gender')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Mixing axes and groups in aggregations:\n" + "or several axes (axes are separated by commas ``,``):" ] }, { @@ -438,29 +542,58 @@ "metadata": {}, "outputs": [], "source": [ - "pop.sum((teens, pensioners), 'nat')" + "pop.sum('country', 'gender')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### More On Aggregations\n" + "Calculate the sum along all axes except one by appending `_by` to the aggregation function:\n" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pop.sum_by('time')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "There are many other aggregation functions:\n", + "Calculate the sum along groups (the groups belonging to the same axis must grouped inside parentheses ()):\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "even_years = pop.time[2014::2] >> 'even_years'\n", + "odd_years = pop.time[2013::2] >> 'odd_years'\n", "\n", - "- mean, min, max, median, percentile, var (variance), std (standard\n", - " deviation)\n", - "- labelofmin, labelofmax (label indirect minimum/maxium -- labels where the\n", - " value is minimum/maximum)\n", - "- indexofmin, indexofmax (positional indirect minimum/maxium -- position\n", - " along axis where the value is minimum/maximum)\n", - "- cumsum, cumprod (cumulative sum, cumulative product)\n" + "pop.sum((odd_years, even_years))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Mixing axes and groups in aggregations:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pop.sum('gender', (odd_years, even_years))" ] } ], @@ -481,7 +614,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.7.3" }, "livereveal": { "autolaunch": false, diff --git a/doc/source/tutorial/tutorial_indexing.ipyml b/doc/source/tutorial/tutorial_indexing.ipyml index 1771a6a8e..848da0b61 100644 --- a/doc/source/tutorial/tutorial_indexing.ipyml +++ b/doc/source/tutorial/tutorial_indexing.ipyml @@ -9,7 +9,6 @@ cells: import warnings warnings.filterwarnings("ignore", message=r'.*numpy.dtype size changed*') - id: 0 metadata: nbsphinx: hidden @@ -20,7 +19,6 @@ cells: - code: | from larray import * - id: 1 - markdown: | Check the version of LArray: @@ -30,7 +28,6 @@ cells: from larray 
import __version__ __version__ - id: 2 - markdown: | Import the test array ``pop``: @@ -38,10 +35,9 @@ cells: - code: | # let's start with - pop = load_example_data('demography').pop + pop = load_example_data('demography_eurostat').pop pop - id: 3 - markdown: | ## Selecting (Subsets) @@ -58,103 +54,101 @@ cells: - code: | - # here we select the value associated with Belgian women - # of age 50 from Brussels region for the year 2015 - pop[2015, 'BruCap', 50, 'F', 'BE'] + pop['Belgium', 'Female', 2017] - id: 4 - markdown: | - Continue with selecting a subset using slices and lists of labels + As long as there is no ambiguity (i.e. axes sharing one or several same label(s)), the order of indexing does not matter. + So you usually do not care/have to remember about axes positions during computation. It only matters for output. - code: | - # here we select the subset associated with Belgian women of age 50, 51 and 52 - # from Brussels region for the years 2010 to 2016 - pop[2010:2016, 'BruCap', 50:52, 'F', 'BE'] + # order of index doesn't matter + pop['Female', 2017, 'Belgium'] + + +- markdown: | + Selecting a subset is done by using slices or lists of labels: - id: 5 - code: | - # slices bounds are optional: - # if not given start is assumed to be the first label and stop is the last one. - # Here we select all years starting from 2010 - pop[2010:, 'BruCap', 50:52, 'F', 'BE'] + pop[['Belgium', 'Germany'], 2014:2016] + + +- markdown: | + Slices bounds are optional: + if not given, start is assumed to be the first label and stop is the last one. - id: 6 - code: | - # Slices can also have a step (defaults to 1), to take every Nth labels - # Here we select all even years starting from 2010 - pop[2010::2, 'BruCap', 50:52, 'F', 'BE'] + # select all years starting from 2015 + pop[2015:] - id: 7 - code: | - # one can also use list of labels to take non-contiguous labels. 
- # Here we select years 2008, 2010, 2013 and 2015 - pop[[2008, 2010, 2013, 2015], 'BruCap', 50:52, 'F', 'BE'] + # select all first years until 2015 + pop[:2015] - id: 8 - markdown: | - The order of indexing does not matter either, so you usually do not care/have to remember about axes positions during computation. It only matters for output. + Slices can also have a step (defaults to 1), to take every Nth labels: - code: | - # order of index doesn't matter - pop['F', 'BE', 'BruCap', [2008, 2010, 2013, 2015], 50:52] + # select all even years starting from 2014 + pop[2014::2] - id: 9 - markdown: |
- **Warning:** Selecting by labels as above works well as long as there is no ambiguity. - When two or more axes have common labels, it may lead to a crash. + **Warning:** Selecting by labels as in above examples works well as long as there is no ambiguity. + When two or more axes have common labels, it leads to a crash. The solution is then to precise to which axis belong the labels.
- code: | - # let us now create an array with the same labels on several axes - age, weight, size = Axis('age=0..80'), Axis('weight=0..120'), Axis('size=0..200') + immigration = load_example_data('demography_eurostat').immigration - arr_ws = ndtest([age, weight, size]) + # the 'immigration' array has two axes (country and citizenship) which share the same labels + immigration - id: 10 - code: | - # let's try to select teenagers with size between 1 m 60 and 1 m 65 and weight > 80 kg. - # In this case the subset is ambiguous and this results in an error: - arr_ws[10:18, :80, 160:165] + # LArray doesn't use the position of the labels used inside the brackets + # to determine the corresponding axes. Instead LArray will try to guess the + # corresponding axis for each label whatever is its position. + # Then, if a label is shared by two or more axes, LArray will not be able + # to choose between the possible axes and will raise an error. + try: + immigration['Belgium', 'Netherlands'] + except Exception as e: + print(type(e).__name__, ':', e) - id: 11 - code: | # the solution is simple. You need to precise the axes on which you make a selection - arr_ws[age[10:18], weight[:80], size[160:165]] + immigration[immigration.country['Belgium'], immigration.citizenship['Netherlands']] - id: 12 - markdown: | ### Ambiguous Cases - Specifying Axes Using The Special Variable X - When selecting, assiging or using aggregate functions, an axis can be - refered via the special variable ``X``: + When selecting, assigning or using aggregate functions, an axis can be + referred via the special variable ``X``: - - pop[X.age[:20]] - - pop.sum(X.age) + - pop[X.time[2015:]] + - pop.sum(X.time) - This gives you acces to axes of the array you are manipulating. The main + This gives you access to axes of the array you are manipulating. The main drawback of using ``X`` is that you lose the autocompletion available from many editors. 
It only works with non-anonymous axes for which names do not contain whitespaces or special characters. - code: | - # the previous example could have been also written as - arr_ws[X.age[10:18], X.weight[:80], X.size[160:165]] + # the previous example can also be written as + immigration[X.country['Belgium'], X.citizenship['Netherlands']] - id: 13 - markdown: | ### Selecting by Indices @@ -173,23 +167,19 @@ cells: - code: | - # here we select the subset associated with Belgian women of age 50, 51 and 52 - # from Brussels region for the first 3 years - pop[X.time.i[:3], 'BruCap', 50:52, 'F', 'BE'] + # select the last year + pop[X.time.i[-1]] - id: 14 - code: | # same but for the last 3 years - pop[X.time.i[-3:], 'BruCap', 50:52, 'F', 'BE'] + pop[X.time.i[-3:]] - id: 15 - code: | - # using list of indices - pop[X.time.i[-9,-7,-4,-2], 'BruCap', 50:52, 'F', 'BE'] + # using a list of indices + pop[X.time.i[0, 2, 4]] - id: 16 - markdown: |
@@ -198,16 +188,17 @@ cells: - code: | - # with labels (3 is included) - pop[2015, 'BruCap', X.age[:3], 'F', 'BE'] + year = 2015 + + # with labels + pop[X.time[:year]] - id: 17 - code: | - # with indices (3 is out) - pop[2015, 'BruCap', X.age.i[:3], 'F', 'BE'] + # with indices (i.e. using the .i[indices] syntax) + index_year = pop.time.index(year) + pop[X.time.i[:index_year]] - id: 18 - markdown: | You can use ``.i[]`` selection directly on array instead of axes. @@ -215,231 +206,195 @@ cells: - code: | - # here we select the last year and first 3 ages - # equivalent to: pop.i[-1, :, :3, :, :] - pop.i[-1, :, :3] + # select first country and last three years + pop.i[0, :, -3:] - id: 19 - markdown: | ### Using Groups In Selections - code: | - teens = pop.age[10:20] + even_years = pop.time[2014::2] - pop[2015, 'BruCap', teens, 'F', 'BE'] + pop[even_years] - id: 20 - markdown: | - ## Assigning subsets - - ### Assigning A Value + ## Boolean Filtering - Assign a value to a subset + Boolean filtering can be used to extract subsets. Filtering can be done on axes: - code: | - # let's take a smaller array - pop = load_example_data('demography').pop[2016, 'BruCap', 100:105] - pop2 = pop - pop2 + # select even years + pop[X.time % 2 == 0] + + +- markdown: | + or data: - id: 21 - code: | - # set all data corresponding to age >= 102 to 0 - pop2[102:] = 0 - pop2 + # select population for the year 2017 + pop_2017 = pop[2017] + + # select all data with a value greater than 30 million + pop_2017[pop_2017 > 30e6] - id: 22 - markdown: | - One very important gotcha though... - -
- **Warning:** Modifying a slice of an array in-place like we did above should be done with care otherwise you could have **unexpected effects**. The reason is that taking a **slice** subset of an array does not return a copy of that array, but rather a view on that array. To avoid such behavior, use ``.copy()`` method. +
+ **Note:** Be aware that after boolean filtering, several axes may have merged.
- - Remember: - - - taking a slice subset of an array is extremely fast (no data is - copied) - - if one modifies that subset in-place, one also **modifies the - original array** - - **.copy()** returns a copy of the subset (takes speed and memory) but - allows you to change the subset without modifying the original array - in the same time -- code: | - # indeed, data from the original array have also changed - pop +- markdown: | + Arrays can also be used to create boolean filters: - id: 23 - code: | - # the right way - pop = load_example_data('demography').pop[2016, 'BruCap', 100:105] - - pop2 = pop.copy() - pop2[102:] = 0 - pop2 + start_year = LArray([2015, 2016, 2017], axes=pop.country) + start_year - id: 24 - code: | - # now, data from the original array have not changed this time - pop + pop[X.time >= start_year] - id: 25 - markdown: | - ### Assigning Arrays And Broadcasting + ## Assigning subsets - Instead of a value, we can also assign an array to a subset. In that - case, that array can have less axes than the target but those which are - present must be compatible with the subset being targeted. - - -- code: | - sex, nat = Axis('sex=M,F'), Axis('nat=BE,FO') - new_value = LArray([[1, -1], [2, -2]],[sex, nat]) - new_value + ### Assigning A Value + + Assigning a value to a subset is simple: - id: 26 - code: | - # this assigns 1, -1 to Belgian, Foreigner men - # and 2, -2 to Belgian, Foreigner women for all - # people older than 100 - pop[102:] = new_value + pop[2017] = 0 pop - id: 27 - markdown: | -
- **Warning:** The array being assigned must have compatible axes (i.e. same axes names and same labels) with the target subset. -
+ Now, let's store a subset in a new variable and modify it: - code: | - # assume we define the following array with shape 3 x 2 x 2 - new_value = zeros(['age=100..102', sex, nat]) - new_value + # store the data associated with the year 2016 in a new variable + pop_2016 = pop[2016] + pop_2016 - id: 28 - code: | - # now let's try to assign the previous array in a subset from age 103 to 105 - pop[103:105] = new_value - - id: 29 - -- code: | - # but this works - pop[100:102] = new_value + # now, we modify the new variable + pop_2016['Belgium'] = 0 + + # and we can see that the original array has been also modified pop - id: 30 - markdown: | - ## Boolean Filtering + One very important gotcha though... - Boolean filtering can be use to extract subsets. - - -- code: | - #Let's focus on population living in Brussels during the year 2016 - pop = load_example_data('demography').pop[2016, 'BruCap'] +
+ **Warning:** Storing a subset of an array in a new variable and modifying it afterwards may also impact the original array. The reason is that selecting a contiguous subset of the data does not return a copy of the selected subset, but rather a view on a subset of the array. To avoid such behavior, use the ``.copy()`` method. +
- # here we select all males and females with age less than 5 and 10 respectively - subset = pop[((X.sex == 'H') & (X.age <= 5)) | ((X.sex == 'F') & (X.age <= 10))] - subset + Remember: + + - taking a contiguous subset of an array is extremely fast (no data is copied) + - if one modifies that subset, one also **modifies the original array** + - **.copy()** returns a copy of the subset (takes speed and memory) but + allows you to change the subset without modifying the original array + at the same time - id: 31 - markdown: | -
- **Note:** Be aware that after boolean filtering, several axes may have merged. -
+ The same warning applies to entire arrays: - code: | - # 'age' and 'sex' axes have been merged together - subset.info + # reload the 'pop' array + pop = load_example_data('demography_eurostat').pop + + # create a second 'pop2' variable + pop2 = pop + pop2 - id: 32 -- markdown: | - This may be not what you because previous selections on merged axes are no longer valid +- code: | + # set all data corresponding to the year 2017 to 0 + pop2[2017] = 0 + pop2 - code: | - # now let's try to calculate the proportion of females with age less than 10 - subset['F'].sum() / pop['F'].sum() + # and now take a look at what happened to the original array 'pop' + # after modifying the 'pop2' array + pop - id: 33 - markdown: | - Therefore, it is sometimes more useful to not select, but rather set to 0 (or another value) non matching elements +
+ **Warning:** The syntax ``new_array = old_array`` does not create a new array but rather an 'alias' variable. To actually create a new array as a copy of a previous one, the ``.copy()`` method must be called. +
- code: | - subset = pop.copy() - subset[((X.sex == 'F') & (X.age > 10))] = 0 - subset['F', :20] + # reload the 'pop' array + pop = load_example_data('demography_eurostat').pop + + # copy the 'pop' array and store the copy in a new variable + pop2 = pop.copy() + + # modify the copy + pop2[2017] = 0 + pop2 - id: 34 - code: | - # now we can calculate the proportion of females with age less than 10 - subset['F'].sum() / pop['F'].sum() + # the data from the original array have not been modified + pop - id: 35 - markdown: | - Boolean filtering can also mix axes and arrays. Example above could also have been written as - - -- code: | - age_limit = sequence('sex=M,F', initial=5, inc=5) - age_limit - - id: 36 - -- code: | - age = pop.axes['age'] - (age <= age_limit)[:20] + ### Assigning Arrays And Broadcasting + + Instead of a value, we can also assign an array to a subset. In that + case, that array can have less axes than the target but those which are + present must be compatible with the subset being targeted. - id: 37 - code: | - subset = pop.copy() - subset[X.age > age_limit] = 0 - subset['F'].sum() / pop['F'].sum() + # select population for the year 2015 + pop_2015 = pop[2015] + + # propagate population for the year 2015 to all next years + pop[2016:] = pop_2015 + + pop - id: 38 - markdown: | - Finally, you can choose to filter on data instead of axes +
+ **Warning:** The array being assigned must have compatible axes (i.e. same axes names and same labels) with the target subset. +
- code: | - # let's focus on females older than 90 - subset = pop['F', 90:110].copy() - subset + # replace 'Male' and 'Female' labels by 'M' and 'F' + pop_2015 = pop_2015.set_labels('gender', 'M,F') + pop_2015 - id: 39 - code: | - # here we set to 0 all data < 10 - subset[subset < 10] = 0 - subset + # now let's try to repeat the assignement operation above with the new labels. + # An error is raised because of incompatible axes + try: + pop[2016:] = pop_2015 + except Exception as e: + print(type(e).__name__, ':', e) - id: 40 # The lines below here may be deleted if you do not need them. # --------------------------------------------------------------------------- @@ -458,30 +413,10 @@ metadata: name: python nbconvert_exporter: python pygments_lexer: ipython3 - version: 3.6.4 + version: 3.7.3 livereveal: autolaunch: false scroll: true nbformat: 4 nbformat_minor: 2 -# --------------------------------------------------------------------------- -data: - [{execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, 
outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}] - diff --git a/doc/source/tutorial/tutorial_indexing.ipynb b/doc/source/tutorial/tutorial_indexing.ipynb index d1c6e1cab..e0a116f46 100644 --- a/doc/source/tutorial/tutorial_indexing.ipynb +++ b/doc/source/tutorial/tutorial_indexing.ipynb @@ -67,7 +67,7 @@ "outputs": [], "source": [ "# let's start with\n", - "pop = load_example_data('demography').pop\n", + "pop = load_example_data('demography_eurostat').pop\n", "pop" ] }, @@ -97,16 +97,15 @@ "metadata": {}, "outputs": [], "source": [ - "# here we select the value associated with Belgian women\n", - "# of age 50 from Brussels region for the year 2015\n", - "pop[2015, 'BruCap', 50, 'F', 'BE']" + "pop['Belgium', 'Female', 2017]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Continue with selecting a subset using slices and lists of labels\n" + "As long as there is no ambiguity (i.e. axes sharing one or several same label(s)), the order of indexing does not matter. \n", + "So you usually do not care/have to remember about axes positions during computation. It only matters for output." 
] }, { @@ -115,9 +114,15 @@ "metadata": {}, "outputs": [], "source": [ - "# here we select the subset associated with Belgian women of age 50, 51 and 52\n", - "# from Brussels region for the years 2010 to 2016\n", - "pop[2010:2016, 'BruCap', 50:52, 'F', 'BE']" + "# order of index doesn't matter\n", + "pop['Female', 2017, 'Belgium']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Selecting a subset is done by using slices or lists of labels:" ] }, { @@ -126,10 +131,15 @@ "metadata": {}, "outputs": [], "source": [ - "# slices bounds are optional:\n", - "# if not given start is assumed to be the first label and stop is the last one.\n", - "# Here we select all years starting from 2010\n", - "pop[2010:, 'BruCap', 50:52, 'F', 'BE']" + "pop[['Belgium', 'Germany'], 2014:2016]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Slices bounds are optional:\n", + "if not given, start is assumed to be the first label and stop is the last one." ] }, { @@ -138,9 +148,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Slices can also have a step (defaults to 1), to take every Nth labels\n", - "# Here we select all even years starting from 2010\n", - "pop[2010::2, 'BruCap', 50:52, 'F', 'BE']" + "# select all years starting from 2015\n", + "pop[2015:]" ] }, { @@ -149,16 +158,15 @@ "metadata": {}, "outputs": [], "source": [ - "# one can also use list of labels to take non-contiguous labels.\n", - "# Here we select years 2008, 2010, 2013 and 2015\n", - "pop[[2008, 2010, 2013, 2015], 'BruCap', 50:52, 'F', 'BE']" + "# select all first years until 2015\n", + "pop[:2015]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The order of indexing does not matter either, so you usually do not care/have to remember about axes positions during computation. 
It only matters for output.\n" + "Slices can also have a step (defaults to 1), to take every Nth labels:" ] }, { @@ -167,8 +175,8 @@ "metadata": {}, "outputs": [], "source": [ - "# order of index doesn't matter\n", - "pop['F', 'BE', 'BruCap', [2008, 2010, 2013, 2015], 50:52]" + "# select all even years starting from 2014\n", + "pop[2014::2]" ] }, { @@ -176,8 +184,8 @@ "metadata": {}, "source": [ "
\n", - "**Warning:** Selecting by labels as above works well as long as there is no ambiguity.\n", - " When two or more axes have common labels, it may lead to a crash.\n", + "**Warning:** Selecting by labels as in above examples works well as long as there is no ambiguity.\n", + " When two or more axes have common labels, it leads to a crash.\n", " The solution is then to precise to which axis belong the labels.\n", "
\n" ] @@ -188,10 +196,10 @@ "metadata": {}, "outputs": [], "source": [ - "# let us now create an array with the same labels on several axes\n", - "age, weight, size = Axis('age=0..80'), Axis('weight=0..120'), Axis('size=0..200')\n", + "immigration = load_example_data('demography_eurostat').immigration\n", "\n", - "arr_ws = ndtest([age, weight, size])" + "# the 'immigration' array has two axes (country and citizenship) which share the same labels\n", + "immigration" ] }, { @@ -200,9 +208,15 @@ "metadata": {}, "outputs": [], "source": [ - "# let's try to select teenagers with size between 1 m 60 and 1 m 65 and weight > 80 kg.\n", - "# In this case the subset is ambiguous and this results in an error:\n", - "arr_ws[10:18, :80, 160:165]" + "# LArray doesn't use the position of the labels used inside the brackets \n", + "# to determine the corresponding axes. Instead LArray will try to guess the \n", + "# corresponding axis for each label whatever is its position.\n", + "# Then, if a label is shared by two or more axes, LArray will not be able \n", + "# to choose between the possible axes and will raise an error.\n", + "try:\n", + " immigration['Belgium', 'Netherlands']\n", + "except Exception as e:\n", + " print(type(e).__name__, ':', e)" ] }, { @@ -212,7 +226,7 @@ "outputs": [], "source": [ "# the solution is simple. 
You need to precise the axes on which you make a selection\n", - "arr_ws[age[10:18], weight[:80], size[160:165]]" + "immigration[immigration.country['Belgium'], immigration.citizenship['Netherlands']]" ] }, { @@ -221,13 +235,13 @@ "source": [ "### Ambiguous Cases - Specifying Axes Using The Special Variable X\n", "\n", - "When selecting, assiging or using aggregate functions, an axis can be\n", - "refered via the special variable ``X``:\n", + "When selecting, assigning or using aggregate functions, an axis can be\n", + "referred via the special variable ``X``:\n", "\n", - "- pop[X.age[:20]]\n", - "- pop.sum(X.age)\n", + "- pop[X.time[2015:]]\n", + "- pop.sum(X.time)\n", "\n", - "This gives you acces to axes of the array you are manipulating. The main\n", + "This gives you access to axes of the array you are manipulating. The main\n", "drawback of using ``X`` is that you lose the autocompletion available from\n", "many editors. It only works with non-anonymous axes for which names do not contain whitespaces or special characters.\n" ] @@ -238,8 +252,8 @@ "metadata": {}, "outputs": [], "source": [ - "# the previous example could have been also written as\n", - "arr_ws[X.age[10:18], X.weight[:80], X.size[160:165]]" + "# the previous example can also be written as\n", + "immigration[X.country['Belgium'], X.citizenship['Netherlands']]" ] }, { @@ -270,9 +284,8 @@ "metadata": {}, "outputs": [], "source": [ - "# here we select the subset associated with Belgian women of age 50, 51 and 52\n", - "# from Brussels region for the first 3 years\n", - "pop[X.time.i[:3], 'BruCap', 50:52, 'F', 'BE']" + "# select the last year\n", + "pop[X.time.i[-1]]" ] }, { @@ -282,7 +295,7 @@ "outputs": [], "source": [ "# same but for the last 3 years\n", - "pop[X.time.i[-3:], 'BruCap', 50:52, 'F', 'BE']" + "pop[X.time.i[-3:]]" ] }, { @@ -291,8 +304,8 @@ "metadata": {}, "outputs": [], "source": [ - "# using list of indices\n", - "pop[X.time.i[-9,-7,-4,-2], 'BruCap', 50:52, 'F', 'BE']" + "# using 
a list of indices\n", + "pop[X.time.i[0, 2, 4]]" ] }, { @@ -310,8 +323,10 @@ "metadata": {}, "outputs": [], "source": [ - "# with labels (3 is included)\n", - "pop[2015, 'BruCap', X.age[:3], 'F', 'BE']" + "year = 2015\n", + "\n", + "# with labels\n", + "pop[X.time[:year]]" ] }, { @@ -320,8 +335,9 @@ "metadata": {}, "outputs": [], "source": [ - "# with indices (3 is out)\n", - "pop[2015, 'BruCap', X.age.i[:3], 'F', 'BE']" + "# with indices (i.e. using the .i[indices] syntax)\n", + "index_year = pop.time.index(year)\n", + "pop[X.time.i[:index_year]]" ] }, { @@ -338,9 +354,8 @@ "metadata": {}, "outputs": [], "source": [ - "# here we select the last year and first 3 ages\n", - "# equivalent to: pop.i[-1, :, :3, :, :]\n", - "pop.i[-1, :, :3]" + "# select first country and last three years\n", + "pop.i[0, :, -3:]" ] }, { @@ -356,20 +371,18 @@ "metadata": {}, "outputs": [], "source": [ - "teens = pop.age[10:20]\n", + "even_years = pop.time[2014::2]\n", "\n", - "pop[2015, 'BruCap', teens, 'F', 'BE']" + "pop[even_years]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Assigning subsets\n", - "\n", - "### Assigning A Value\n", + "## Boolean Filtering\n", "\n", - "Assign a value to a subset\n" + "Boolean filtering can be used to extract subsets. 
Filtering can be done on axes:" ] }, { @@ -378,10 +391,15 @@ "metadata": {}, "outputs": [], "source": [ - "# let's take a smaller array\n", - "pop = load_example_data('demography').pop[2016, 'BruCap', 100:105]\n", - "pop2 = pop\n", - "pop2" + "# select even years\n", + "pop[X.time % 2 == 0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "or data:" ] }, { @@ -390,40 +408,27 @@ "metadata": {}, "outputs": [], "source": [ - "# set all data corresponding to age >= 102 to 0\n", - "pop2[102:] = 0\n", - "pop2" + "# select population for the year 2017\n", + "pop_2017 = pop[2017]\n", + "\n", + "# select all data with a value greater than 30 million\n", + "pop_2017[pop_2017 > 30e6]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "One very important gotcha though...\n", - "\n", - "
\n", - "**Warning:** Modifying a slice of an array in-place like we did above should be done with care otherwise you could have **unexpected effects**. The reason is that taking a **slice** subset of an array does not return a copy of that array, but rather a view on that array. To avoid such behavior, use ``.copy()`` method.\n", - "
\n", - "\n", - "Remember:\n", - "\n", - "- taking a slice subset of an array is extremely fast (no data is\n", - " copied)\n", - "- if one modifies that subset in-place, one also **modifies the\n", - " original array**\n", - "- **.copy()** returns a copy of the subset (takes speed and memory) but\n", - " allows you to change the subset without modifying the original array\n", - " in the same time\n" + "
\n", + "**Note:** Be aware that after boolean filtering, several axes may have merged.\n", + "
\n" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# indeed, data from the original array have also changed\n", - "pop" + "Arrays can also be used to create boolean filters:\n" ] }, { @@ -432,12 +437,8 @@ "metadata": {}, "outputs": [], "source": [ - "# the right way\n", - "pop = load_example_data('demography').pop[2016, 'BruCap', 100:105]\n", - "\n", - "pop2 = pop.copy()\n", - "pop2[102:] = 0\n", - "pop2" + "start_year = LArray([2015, 2016, 2017], axes=pop.country)\n", + "start_year" ] }, { @@ -446,30 +447,18 @@ "metadata": {}, "outputs": [], "source": [ - "# now, data from the original array have not changed this time\n", - "pop" + "pop[X.time >= start_year]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Assigning Arrays And Broadcasting\n", + "## Assigning subsets\n", "\n", - "Instead of a value, we can also assign an array to a subset. In that\n", - "case, that array can have less axes than the target but those which are\n", - "present must be compatible with the subset being targeted.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sex, nat = Axis('sex=M,F'), Axis('nat=BE,FO')\n", - "new_value = LArray([[1, -1], [2, -2]],[sex, nat])\n", - "new_value" + "### Assigning A Value\n", + "\n", + "Assigning a value to a subset is simple:\n" ] }, { @@ -478,10 +467,7 @@ "metadata": {}, "outputs": [], "source": [ - "# this assigns 1, -1 to Belgian, Foreigner men\n", - "# and 2, -2 to Belgian, Foreigner women for all\n", - "# people older than 100\n", - "pop[102:] = new_value\n", + "pop[2017] = 0\n", "pop" ] }, @@ -489,20 +475,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "
\n", - "**Warning:** The array being assigned must have compatible axes (i.e. same axes names and same labels) with the target subset.\n", - "
\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# assume we define the following array with shape 3 x 2 x 2\n", - "new_value = zeros(['age=100..102', sex, nat])\n", - "new_value" + "Now, let's store a subset in a new variable and modify it:" ] }, { @@ -511,8 +484,9 @@ "metadata": {}, "outputs": [], "source": [ - "# now let's try to assign the previous array in a subset from age 103 to 105\n", - "pop[103:105] = new_value" + "# store the data associated with the year 2016 in a new variable\n", + "pop_2016 = pop[2016]\n", + "pop_2016" ] }, { @@ -521,8 +495,10 @@ "metadata": {}, "outputs": [], "source": [ - "# but this works\n", - "pop[100:102] = new_value\n", + "# now, we modify the new variable\n", + "pop_2016['Belgium'] = 0\n", + "\n", + "# and we can see that the original array has been also modified\n", "pop" ] }, @@ -530,32 +506,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Boolean Filtering\n", + "One very important gotcha though...\n", "\n", - "Boolean filtering can be use to extract subsets.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Let's focus on population living in Brussels during the year 2016\n", - "pop = load_example_data('demography').pop[2016, 'BruCap']\n", + "
\n", + "**Warning:** Storing a subset of an array in a new variable and modifying it afterwards may also impact the original array. The reason is that selecting a contiguous subset of the data does not return a copy of the selected subset, but rather a view on a subset of the array. To avoid such behavior, use the ``.copy()`` method.\n", + "
\n", "\n", - "# here we select all males and females with age less than 5 and 10 respectively\n", - "subset = pop[((X.sex == 'H') & (X.age <= 5)) | ((X.sex == 'F') & (X.age <= 10))]\n", - "subset" + "Remember:\n", + "\n", + "- taking a contiguous subset of an array is extremely fast (no data is copied)\n", + "- if one modifies that subset, one also **modifies the original array**\n", + "- **.copy()** returns a copy of the subset (takes speed and memory) but\n", + " allows you to change the subset without modifying the original array\n", + " at the same time\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "
\n", - "**Note:** Be aware that after boolean filtering, several axes may have merged.\n", - "
\n" + "The same warning applies to entire arrays:" ] }, { @@ -564,15 +534,23 @@ "metadata": {}, "outputs": [], "source": [ - "# 'age' and 'sex' axes have been merged together\n", - "subset.info" + "# reload the 'pop' array\n", + "pop = load_example_data('demography_eurostat').pop\n", + "\n", + "# create a second 'pop2' variable\n", + "pop2 = pop\n", + "pop2" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "This may be not what you because previous selections on merged axes are no longer valid\n" + "# set all data corresponding to the year 2017 to 0\n", + "pop2[2017] = 0\n", + "pop2" ] }, { @@ -581,15 +559,18 @@ "metadata": {}, "outputs": [], "source": [ - "# now let's try to calculate the proportion of females with age less than 10\n", - "subset['F'].sum() / pop['F'].sum()" + "# and now take a look at what happened to the original array 'pop'\n", + "# after modifying the 'pop2' array\n", + "pop" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Therefore, it is sometimes more useful to not select, but rather set to 0 (or another value) non matching elements\n" + "
\n", + "**Warning:** The syntax ``new_array = old_array`` does not create a new array but rather an 'alias' variable. To actually create a new array as a copy of a previous one, the ``.copy()`` method must be called.\n", + "
" ] }, { @@ -598,9 +579,15 @@ "metadata": {}, "outputs": [], "source": [ - "subset = pop.copy()\n", - "subset[((X.sex == 'F') & (X.age > 10))] = 0\n", - "subset['F', :20]" + "# reload the 'pop' array\n", + "pop = load_example_data('demography_eurostat').pop\n", + "\n", + "# copy the 'pop' array and store the copy in a new variable\n", + "pop2 = pop.copy()\n", + "\n", + "# modify the copy\n", + "pop2[2017] = 0\n", + "pop2" ] }, { @@ -609,35 +596,19 @@ "metadata": {}, "outputs": [], "source": [ - "# now we can calculate the proportion of females with age less than 10\n", - "subset['F'].sum() / pop['F'].sum()" + "# the data from the original array have not been modified\n", + "pop" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Boolean filtering can also mix axes and arrays. Example above could also have been written as\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "age_limit = sequence('sex=M,F', initial=5, inc=5)\n", - "age_limit" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "age = pop.axes['age']\n", - "(age <= age_limit)[:20]" + "### Assigning Arrays And Broadcasting\n", + "\n", + "Instead of a value, we can also assign an array to a subset. In that\n", + "case, that array can have less axes than the target but those which are\n", + "present must be compatible with the subset being targeted.\n" ] }, { @@ -646,16 +617,22 @@ "metadata": {}, "outputs": [], "source": [ - "subset = pop.copy()\n", - "subset[X.age > age_limit] = 0\n", - "subset['F'].sum() / pop['F'].sum()" + "# select population for the year 2015\n", + "pop_2015 = pop[2015]\n", + "\n", + "# propagate population for the year 2015 to all next years\n", + "pop[2016:] = pop_2015\n", + "\n", + "pop" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Finally, you can choose to filter on data instead of axes\n" + "
\n", + "**Warning:** The array being assigned must have axes compatible with the target subset (i.e. the same axis names and the same labels).\n", + "
\n" ] }, { @@ -664,9 +641,9 @@ "metadata": {}, "outputs": [], "source": [ - "# let's focus on females older than 90\n", - "subset = pop['F', 90:110].copy()\n", - "subset" + "# replace 'Male' and 'Female' labels by 'M' and 'F'\n", + "pop_2015 = pop_2015.set_labels('gender', 'M,F')\n", + "pop_2015" ] }, { @@ -675,9 +652,12 @@ "metadata": {}, "outputs": [], "source": [ - "# here we set to 0 all data < 10\n", - "subset[subset < 10] = 0\n", - "subset" + "# now let's try to repeat the assignement operation above with the new labels.\n", + "# An error is raised because of incompatible axes\n", + "try:\n", + " pop[2016:] = pop_2015\n", + "except Exception as e:\n", + " print(type(e).__name__, ':', e)" ] } ], @@ -698,7 +678,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.7.3" }, "livereveal": { "autolaunch": false, diff --git a/doc/source/tutorial/tutorial_miscellaneous.ipyml b/doc/source/tutorial/tutorial_miscellaneous.ipyml index c8c2ae69a..216668502 100644 --- a/doc/source/tutorial/tutorial_miscellaneous.ipyml +++ b/doc/source/tutorial/tutorial_miscellaneous.ipyml @@ -1,7 +1,7 @@ cells: - markdown: | - # Miscellaneous (other interesting array functions) + # Some Useful Functions - code: | @@ -9,7 +9,6 @@ cells: import warnings warnings.filterwarnings("ignore", message=r'.*numpy.dtype size changed*') - id: 0 metadata: nbsphinx: hidden @@ -20,7 +19,6 @@ cells: - code: | from larray import * - id: 1 - markdown: | Check the version of LArray: @@ -30,120 +28,114 @@ cells: from larray import __version__ __version__ - id: 2 - -- markdown: | - Import a subset of the test array ``pop``: - - code: | - # import a 6 x 2 x 2 subset of the 'pop' example array - pop = load_example_data('demography').pop[2016, 'BruCap', 100:105] + # load 'demography_eurostat' dataset + demo_eurostat = load_example_data('demography_eurostat') + + # extract the 'pop' array from the dataset + pop = demo_eurostat.pop pop - id: 3 - 
markdown: | ### with total - Add totals to one axis + Add totals to one or several axes: - code: | - pop.with_total('sex', label='B') + pop.with_total('gender', label='Total') - id: 4 - markdown: | - Add totals to all axes at once + See [with_total](../_generated/larray.LArray.with_total.rst#larray.LArray.with_total) for more details and examples. -- code: | - # by default label is 'total' - pop.with_total() - - id: 5 - - markdown: | ### where - where can be used to apply some computation depending on a condition + The ``where`` function can be used to apply some computation depending on a condition: - code: | # where(condition, value if true, value if false) - where(pop < 10, 0, -pop) + where(pop < pop.mean('time'), -pop, pop) + + +- markdown: | + See [where](../_generated/larray.where.rst#larray.where) for more details and examples. - id: 6 - markdown: | ### clip - Set all data between a certain range + Set all data between a certain range: - code: | - # clip(min, max) - # values below 10 are set to 10 and values above 50 are set to 50 - pop.clip(10, 50) + # values below 10 millions are set to 10 millions + pop.clip(minval=10**7) - id: 7 -- markdown: | - ### divnot0 - - Replace division by 0 to 0 +- code: | + # values above 40 millions are set to 40 millions + pop.clip(maxval=4*10**7) - code: | - pop['BE'] / pop['FO'] + # values below 10 millions are set to 10 millions and + # values above 40 millions are set to 40 millions + pop.clip(10**7, 4*10**7) - id: 8 -- code: | - # divnot0 replaces results of division by 0 by 0. - # Using it should be done with care though - # because it can hide a real error in your data. - pop['BE'].divnot0(pop['FO']) +- markdown: | + See [clip](../_generated/larray.LArray.clip.rst#larray.LArray.clip) for more details and examples. - id: 9 - markdown: | - ### diff + ### divnot0 - The ``diff`` method calculates the n-th order discrete difference along a given axis. 
- The first order difference is given by out[n+1] = in[n + 1] - in[n] along the given axis. + Replace division by 0 by 0: - code: | - pop = load_example_data('demography').pop[2005:2015, 'BruCap', 50] - pop + divisor = ones(pop.axes, dtype=int) + divisor['Male'] = 0 + divisor - id: 10 - code: | - # calculates 'pop[year+1] - pop[year]' - pop.diff('time') + pop / divisor - id: 11 - code: | - # calculates 'pop[year+2] - pop[year]' - pop.diff('time', d=2) + # we use astype(int) since the divnot0 method + # returns a float array in this case while + # we want an integer array + pop.divnot0(divisor).astype(int) + + +- markdown: | + See [divnot0](../_generated/larray.LArray.divnot0.rst#larray.LArray.divnot0) for more details and examples. - id: 12 - markdown: | ### ratio + + The ``ratio`` (``rationot0``) method returns an array with all values divided by the sum of values along given axes: - code: | - pop.ratio('nat') + pop.ratio('gender') # which is equivalent to - pop / pop.sum('nat') + pop / pop.sum('gender') + + +- markdown: | + See [ratio](../_generated/larray.LArray.ratio.rst#larray.LArray.ratio) and [rationot0](../_generated/larray.LArray.rationot0.rst#larray.LArray.rationot0) for more details and examples. - id: 13 - markdown: | ### percents @@ -151,20 +143,55 @@ cells: - code: | # or, if you want the previous ratios in percents - pop.percent('nat') + pop.percent('gender') + + +- markdown: | + See [percent](../_generated/larray.LArray.percent.rst#larray.LArray.percent) for more details and examples. + + +- markdown: | + ### diff + + The ``diff`` method calculates the n-th order discrete difference along a given axis. + + The first order difference is given by ``out[n+1] = in[n+1] - in[n]`` along the given axis. 
+ + +- code: | + # calculates 'diff[year+1] = pop[year+1] - pop[year]' + pop.diff('time') + + +- code: | + # calculates 'diff[year+2] = pop[year+2] - pop[year]' + pop.diff('time', d=2) + + +- code: | + # calculates 'diff[year] = pop[year+1] - pop[year]' + pop.diff('time', label='lower') + + +- markdown: | + See [diff](../_generated/larray.LArray.diff.rst#larray.LArray.diff) for more details and examples. - id: 14 - markdown: | ### growth\_rate - using the same principle than `diff` + The ``growth_rate`` method calculates the growth along a given axis. + + It is roughly equivalent to ``a.diff(axis, d, label) / a[axis.i[:-d]]``: - code: | pop.growth_rate('time') - id: 15 + +- markdown: | + See [growth_rate](../_generated/larray.LArray.growth_rate.rst#larray.LArray.growth_rate) for more details and examples. + - markdown: | ### shift @@ -175,32 +202,22 @@ cells: - code: | pop.shift('time') - id: 16 - code: | # when shift is applied on an (increasing) time axis, # it effectively brings "past" data into the future - pop.shift('time').ignore_labels('time') == pop[2005:2014].ignore_labels('time') + pop_shifted = pop.shift('time') + stack({'pop_shifted_2014': pop_shifted[2014], 'pop_2013': pop[2013]}, 'array') - id: 17 -- code: | - # this is mostly useful when you want to do operations between the past and now - # as an example, here is an alternative implementation of the .diff method seen above: - pop.i[1:] - pop.shift('time') +- markdown: | + See [shift](../_generated/larray.LArray.shift.rst#larray.LArray.shift) for more details and examples. - id: 18 - markdown: | - ### Misc other interesting functions + ### Other interesting functions - There are a lot more interesting functions available: - - - round, floor, ceil, trunc, - - exp, log, log10, - - sqrt, absolute, nan_to_num, isnan, isinf, inverse, - - sin, cos, tan, arcsin, arccos, arctan - - and many many more... 
+ There are a lot more interesting functions that you can find in the API reference in sections [Aggregation Functions](../api.rst#aggregation-functions), [Miscellaneous](../api.rst#miscellaneous) and [Utility Functions](../api.rst#utility-functions). # The lines below here may be deleted if you do not need them. @@ -220,21 +237,10 @@ metadata: name: python nbconvert_exporter: python pygments_lexer: ipython3 - version: 3.6.4 + version: 3.7.3 livereveal: autolaunch: false scroll: true nbformat: 4 nbformat_minor: 2 -# --------------------------------------------------------------------------- -data: - [{execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}] - diff --git a/doc/source/tutorial/tutorial_miscellaneous.ipynb b/doc/source/tutorial/tutorial_miscellaneous.ipynb index 2a723fe8b..78f5c4251 100644 --- a/doc/source/tutorial/tutorial_miscellaneous.ipynb +++ b/doc/source/tutorial/tutorial_miscellaneous.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Miscellaneous (other interesting array functions)\n" + "# Some Useful Functions\n" ] }, { @@ -53,21 +53,17 @@ "__version__" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Import a subset of the test array ``pop``:\n" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, 
"outputs": [], "source": [ - "# import a 6 x 2 x 2 subset of the 'pop' example array\n", - "pop = load_example_data('demography').pop[2016, 'BruCap', 100:105]\n", + "# load 'demography_eurostat' dataset\n", + "demo_eurostat = load_example_data('demography_eurostat')\n", + "\n", + "# extract the 'pop' array from the dataset \n", + "pop = demo_eurostat.pop\n", "pop" ] }, @@ -77,7 +73,7 @@ "source": [ "### with total\n", "\n", - "Add totals to one axis\n" + "Add totals to one or several axes:\n" ] }, { @@ -86,14 +82,23 @@ "metadata": {}, "outputs": [], "source": [ - "pop.with_total('sex', label='B')" + "pop.with_total('gender', label='Total')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Add totals to all axes at once\n" + "See [with_total](../_generated/larray.LArray.with_total.rst#larray.LArray.with_total) for more details and examples.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### where\n", + "\n", + "The ``where`` function can be used to apply some computation depending on a condition:\n" ] }, { @@ -102,17 +107,24 @@ "metadata": {}, "outputs": [], "source": [ - "# by default label is 'total'\n", - "pop.with_total()" + "# where(condition, value if true, value if false)\n", + "where(pop < pop.mean('time'), -pop, pop)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### where\n", + "See [where](../_generated/larray.where.rst#larray.where) for more details and examples.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### clip\n", "\n", - "where can be used to apply some computation depending on a condition\n" + "Set all data between a certain range:\n" ] }, { @@ -121,17 +133,18 @@ "metadata": {}, "outputs": [], "source": [ - "# where(condition, value if true, value if false)\n", - "where(pop < 10, 0, -pop)" + "# values below 10 millions are set to 10 millions\n", + "pop.clip(minval=10**7)" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 
null, "metadata": {}, + "outputs": [], "source": [ - "### clip\n", - "\n", - "Set all data between a certain range\n" + "# values above 40 millions are set to 40 millions\n", + "pop.clip(maxval=4*10**7)" ] }, { @@ -140,9 +153,16 @@ "metadata": {}, "outputs": [], "source": [ - "# clip(min, max)\n", - "# values below 10 are set to 10 and values above 50 are set to 50\n", - "pop.clip(10, 50)" + "# values below 10 millions are set to 10 millions and \n", + "# values above 40 millions are set to 40 millions\n", + "pop.clip(10**7, 4*10**7)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See [clip](../_generated/larray.LArray.clip.rst#larray.LArray.clip) for more details and examples.\n" ] }, { @@ -151,7 +171,7 @@ "source": [ "### divnot0\n", "\n", - "Replace division by 0 to 0\n" + "Replace division by 0 by 0:\n" ] }, { @@ -160,7 +180,9 @@ "metadata": {}, "outputs": [], "source": [ - "pop['BE'] / pop['FO']" + "divisor = ones(pop.axes, dtype=int)\n", + "divisor['Male'] = 0\n", + "divisor" ] }, { @@ -169,20 +191,35 @@ "metadata": {}, "outputs": [], "source": [ - "# divnot0 replaces results of division by 0 by 0.\n", - "# Using it should be done with care though\n", - "# because it can hide a real error in your data.\n", - "pop['BE'].divnot0(pop['FO'])" + "pop / divisor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# we use astype(int) since the divnot0 method \n", + "# returns a float array in this case while \n", + "# we want an integer array\n", + "pop.divnot0(divisor).astype(int)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### diff\n", + "See [divnot0](../_generated/larray.LArray.divnot0.rst#larray.LArray.divnot0) for more details and examples.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ratio\n", "\n", - "The ``diff`` method calculates the n-th order discrete difference along a given axis.\n", - "The first order 
difference is given by out[n+1] = in[n + 1] - in[n] along the given axis.\n" + "The ``ratio`` (``rationot0``) method returns an array with all values divided by the sum of values along given axes:" ] }, { @@ -191,18 +228,24 @@ "metadata": {}, "outputs": [], "source": [ - "pop = load_example_data('demography').pop[2005:2015, 'BruCap', 50]\n", - "pop" + "pop.ratio('gender')\n", + "\n", + "# which is equivalent to\n", + "pop / pop.sum('gender')" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# calculates 'pop[year+1] - pop[year]'\n", - "pop.diff('time')" + "See [ratio](../_generated/larray.LArray.ratio.rst#larray.LArray.ratio) and [rationot0](../_generated/larray.LArray.rationot0.rst#larray.LArray.rationot0) for more details and examples.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### percents\n" ] }, { @@ -211,15 +254,26 @@ "metadata": {}, "outputs": [], "source": [ - "# calculates 'pop[year+2] - pop[year]'\n", - "pop.diff('time', d=2)" + "# or, if you want the previous ratios in percents\n", + "pop.percent('gender')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### ratio\n" + "See [percent](../_generated/larray.LArray.percent.rst#larray.LArray.percent) for more details and examples.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### diff\n", + "\n", + "The ``diff`` method calculates the n-th order discrete difference along a given axis.\n", + "\n", + "The first order difference is given by ``out[n+1] = in[n+1] - in[n]`` along the given axis.\n" ] }, { @@ -228,17 +282,18 @@ "metadata": {}, "outputs": [], "source": [ - "pop.ratio('nat')\n", - "\n", - "# which is equivalent to\n", - "pop / pop.sum('nat')" + "# calculates 'diff[year+1] = pop[year+1] - pop[year]'\n", + "pop.diff('time')" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], 
"source": [ - "### percents\n" + "# calculates 'diff[year+2] = pop[year+2] - pop[year]'\n", + "pop.diff('time', d=2)" ] }, { @@ -247,8 +302,15 @@ "metadata": {}, "outputs": [], "source": [ - "# or, if you want the previous ratios in percents\n", - "pop.percent('nat')" + "# calculates 'diff[year] = pop[year+1] - pop[year]'\n", + "pop.diff('time', label='lower')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See [diff](../_generated/larray.LArray.diff.rst#larray.LArray.diff) for more details and examples.\n" ] }, { @@ -257,7 +319,9 @@ "source": [ "### growth\\_rate\n", "\n", - "using the same principle than `diff`\n" + "The ``growth_rate`` method calculates the growth along a given axis.\n", + " \n", + "It is roughly equivalent to ``a.diff(axis, d, label) / a[axis.i[:-d]]``:\n" ] }, { @@ -269,6 +333,13 @@ "pop.growth_rate('time')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See [growth_rate](../_generated/larray.LArray.growth_rate.rst#larray.LArray.growth_rate) for more details and examples.\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -295,33 +366,24 @@ "source": [ "# when shift is applied on an (increasing) time axis,\n", "# it effectively brings \"past\" data into the future\n", - "pop.shift('time').ignore_labels('time') == pop[2005:2014].ignore_labels('time')" + "pop_shifted = pop.shift('time')\n", + "stack({'pop_shifted_2014': pop_shifted[2014], 'pop_2013': pop[2013]}, 'array')" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# this is mostly useful when you want to do operations between the past and now\n", - "# as an example, here is an alternative implementation of the .diff method seen above:\n", - "pop.i[1:] - pop.shift('time')" + "See [shift](../_generated/larray.LArray.shift.rst#larray.LArray.shift) for more details and examples.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Misc 
other interesting functions\n", - "\n", - "There are a lot more interesting functions available:\n", + "### Other interesting functions\n", "\n", - "- round, floor, ceil, trunc,\n", - "- exp, log, log10,\n", - "- sqrt, absolute, nan_to_num, isnan, isinf, inverse,\n", - "- sin, cos, tan, arcsin, arccos, arctan\n", - "- and many many more...\n" + "There are a lot more interesting functions that you can find in the API reference in sections [Aggregation Functions](../api.rst#aggregation-functions), [Miscellaneous](../api.rst#miscellaneous) and [Utility Functions](../api.rst#utility-functions).\n" ] } ], @@ -342,7 +404,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.7.3" }, "livereveal": { "autolaunch": false, diff --git a/doc/source/tutorial/tutorial_plotting.ipyml b/doc/source/tutorial/tutorial_plotting.ipyml index 4cf5bdb77..2c3272daa 100644 --- a/doc/source/tutorial/tutorial_plotting.ipyml +++ b/doc/source/tutorial/tutorial_plotting.ipyml @@ -9,7 +9,6 @@ cells: import warnings warnings.filterwarnings("ignore", message=r'.*numpy.dtype size changed*') - id: 0 metadata: nbsphinx: hidden @@ -20,7 +19,6 @@ cells: - code: | from larray import * - id: 1 - markdown: | Check the version of LArray: @@ -30,27 +28,27 @@ cells: from larray import __version__ __version__ - id: 2 - markdown: | Import a subset of the test array ``pop``: - code: | - # import a 6 x 2 x 2 subset of the 'pop' example array - pop = load_example_data('demography').pop[2016, 'BruCap', 90:95] + # load 'demography_eurostat' dataset + demo_eurostat = load_example_data('demography_eurostat') + + # extract the 'pop' array from the dataset + pop = demo_eurostat.pop pop - id: 3 - markdown: | - Inline matplotlib: + Inline matplotlib (required in notebooks): - code: | - %matplotlib inline + %matplotlib inline - id: 4 - markdown: | Create a plot (last axis define the different curves to draw): @@ -59,13 +57,10 @@ cells: - code: | pop.plot() 
- id: 5 -- code: | - # plot total of both sex - pop.sum('sex').plot() +- markdown: | + See [plot](../_generated/larray.LArray.plot.rst#larray.LArray.plot) for more details and examples. - id: 6 # The lines below here may be deleted if you do not need them. # --------------------------------------------------------------------------- @@ -84,16 +79,10 @@ metadata: name: python nbconvert_exporter: python pygments_lexer: ipython3 - version: 3.6.4 + version: 3.7.3 livereveal: autolaunch: false scroll: true nbformat: 4 nbformat_minor: 2 -# --------------------------------------------------------------------------- -data: - [{execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}] - diff --git a/doc/source/tutorial/tutorial_plotting.ipynb b/doc/source/tutorial/tutorial_plotting.ipynb index b3867582d..d7d37d764 100644 --- a/doc/source/tutorial/tutorial_plotting.ipynb +++ b/doc/source/tutorial/tutorial_plotting.ipynb @@ -66,8 +66,11 @@ "metadata": {}, "outputs": [], "source": [ - "# import a 6 x 2 x 2 subset of the 'pop' example array\n", - "pop = load_example_data('demography').pop[2016, 'BruCap', 90:95]\n", + "# load 'demography_eurostat' dataset\n", + "demo_eurostat = load_example_data('demography_eurostat')\n", + "\n", + "# extract the 'pop' array from the dataset \n", + "pop = demo_eurostat.pop\n", "pop" ] }, @@ -75,7 +78,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Inline matplotlib:\n" + "Inline matplotlib (required in notebooks):\n" ] }, { @@ -84,7 +87,7 @@ "metadata": {}, "outputs": [], "source": [ - "%matplotlib inline " + "%matplotlib inline" ] }, { @@ -104,13 +107,10 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# plot total of both sex\n", - 
"pop.sum('sex').plot()" + "See [plot](../_generated/larray.LArray.plot.rst#larray.LArray.plot) for more details and examples." ] } ], @@ -131,7 +131,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.7.3" }, "livereveal": { "autolaunch": false, diff --git a/doc/source/tutorial/tutorial_presenting_larray_objects.ipyml b/doc/source/tutorial/tutorial_presenting_larray_objects.ipyml index bf2c9680a..ff4342375 100644 --- a/doc/source/tutorial/tutorial_presenting_larray_objects.ipyml +++ b/doc/source/tutorial/tutorial_presenting_larray_objects.ipyml @@ -330,7 +330,7 @@ cells: # add axes to the session gender = Axis("gender=Male,Female") s_pop.gender = gender - time = Axis("time=2013,2014,2015") + time = Axis("time=2013..2017") s_pop.time = time # add arrays to the session @@ -352,7 +352,7 @@ cells: - code: | gender = Axis("gender=Male,Female") - time = Axis("time=2013,2014,2015") + time = Axis("time=2013..2017") # create and populate a new session in one step # Python <= 3.5 @@ -411,7 +411,7 @@ metadata: name: python nbconvert_exporter: python pygments_lexer: ipython3 - version: 3.6.8 + version: 3.7.3 livereveal: autolaunch: false scroll: true diff --git a/doc/source/tutorial/tutorial_presenting_larray_objects.ipynb b/doc/source/tutorial/tutorial_presenting_larray_objects.ipynb index 55af251d7..c98ffb444 100644 --- a/doc/source/tutorial/tutorial_presenting_larray_objects.ipynb +++ b/doc/source/tutorial/tutorial_presenting_larray_objects.ipynb @@ -536,7 +536,7 @@ "# add axes to the session\n", "gender = Axis(\"gender=Male,Female\")\n", "s_pop.gender = gender\n", - "time = Axis(\"time=2013,2014,2015\")\n", + "time = Axis(\"time=2013..2017\")\n", "s_pop.time = time\n", "\n", "# add arrays to the session\n", @@ -566,7 +566,7 @@ "outputs": [], "source": [ "gender = Axis(\"gender=Male,Female\")\n", - "time = Axis(\"time=2013,2014,2015\")\n", + "time = Axis(\"time=2013..2017\")\n", "\n", "# create and 
populate a new session in one step\n", "# Python <= 3.5\n", @@ -632,7 +632,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.3" }, "livereveal": { "autolaunch": false, diff --git a/doc/source/tutorial/tutorial_sessions.ipyml b/doc/source/tutorial/tutorial_sessions.ipyml index 8efdd2cb0..77e2a954a 100644 --- a/doc/source/tutorial/tutorial_sessions.ipyml +++ b/doc/source/tutorial/tutorial_sessions.ipyml @@ -43,7 +43,7 @@ cells: - code: | # load a session representing the results of a demographic model - filepath_hdf = get_example_filepath('population_session.h5') + filepath_hdf = get_example_filepath('demography_eurostat.h5') s_pop = Session(filepath_hdf) # print the content of the session @@ -62,13 +62,27 @@ cells: - markdown: | ### Selecting And Filtering Items - To select an item, simply use the syntax ``.``: + Session objects work like ordinary ``dict`` Python objects. To select an item, use the usual syntax ``['']``: + + +- code: | + s_pop['pop'] + + +- markdown: | + A simpler way consists in the use the syntax ``.``: - code: | s_pop.pop +- markdown: | +
+ **Warning:** The syntax ``session_var.item_name`` will work as long as you don't use any special character like ``, ; :`` in the item's name. +
+ + - markdown: | To return a new session with selected items, use the syntax ``[list, of, item, names]``: @@ -93,6 +107,39 @@ cells: s_pop.filter(pattern='[a-k]*') +- markdown: | + ### Iterating over Items + + Like the built-in Python ``dict`` objects, Session objects provide methods to iterate over items: + + +- code: | + # iterate over item names + for key in s_pop.keys(): + print(key) + + +- code: | + # iterate over items + for value in s_pop.values(): + if isinstance(value, LArray): + print(value.info) + else: + print(repr(value)) + print() + + +- code: | + # iterate over names and items + for key, value in s_pop.items(): + if isinstance(value, LArray): + print(key, ':') + print(value.info) + else: + print(key, ':', repr(value)) + print() + + - markdown: | ### Arithmetic Operations On Sessions @@ -112,13 +159,13 @@ cells: - code: | from larray import random - random_multiplicator = random.choice([0.98, 1.0, 1.02], p=[0.15, 0.7, 0.15], axes=s_pop.pop.axes) - random_multiplicator + random_increment = random.choice([-1, 0, 1], p=[0.3, 0.4, 0.3], axes=s_pop.pop.axes) * 1000 + random_increment - code: | - # multiply all variables of a session by a common array - s_pop_rand = s_pop * random_multiplicator + # add some variables of a session by a common array + s_pop_rand = s_pop['pop', 'births', 'deaths'] + random_increment s_pop_rand.pop @@ -141,17 +188,21 @@ cells: - code: | - # force conversion to type int - def as_type_int(array): - return array.astype(int) + # add the next year to all arrays + def add_next_year(array): + if 'time' in array.axes.names: + last_year = array.time.i[-1] + return array.append('time', 0, last_year + 1) + else: + return array - s_pop_rand_int = s_pop_rand.apply(as_type_int) + s_pop_with_next_year = s_pop.apply(add_next_year) print('pop array before calling apply:') - print(s_pop_rand.pop) + print(s_pop.pop) print() print('pop array after calling apply:') - print(s_pop_rand_int.pop) + print(s_pop_with_next_year.pop) - markdown: | @@ 
-159,15 +210,24 @@ cells: - code: | - # passing the LArray.astype method directly with argument - # dtype defined as int - s_pop_rand_int = s_pop_rand.apply(LArray.astype, dtype=int) + # add the next year to all arrays. + # Use the 'copy_values_from_last_year flag' to indicate + # whether or not to copy values from the last year + def add_next_year(array, copy_values_from_last_year): + if 'time' in array.axes.names: + last_year = array.time.i[-1] + value = array[last_year] if copy_values_from_last_year else 0 + return array.append('time', value, last_year + 1) + else: + return array + + s_pop_with_next_year = s_pop.apply(add_next_year, True) print('pop array before calling apply:') - print(s_pop_rand.pop) + print(s_pop.pop) print() print('pop array after calling apply:') - print(s_pop_rand_int.pop) + print(s_pop_with_next_year.pop) - markdown: | @@ -181,54 +241,62 @@ cells: - markdown: | - [Session objects](../api.rst#session) provide the two methods to compare two sessions: [equals](../_generated/larray.Session.equals.rst#larray.Session.equals) and [element_equals](../_generated/larray.Session.element_equals.rst#larray.Session.element_equals). + [Session objects](../api.rst#session) provide the two methods to compare two sessions: [equals](../_generated/larray.Session.equals.rst#larray.Session.equals) and [element_equals](../_generated/larray.Session.element_equals.rst#larray.Session.element_equals): - The ``equals`` method will return True if **all items** from both sessions are identical, False otherwise: + - The ``equals`` method will return True if **all items** from both sessions are identical, False otherwise. + - The ``element_equals`` method will compare items of two sessions one by one and return an array of boolean values. 
- code: | # load a session representing the results of a demographic model - filepath_hdf = get_example_filepath('population_session.h5') + filepath_hdf = get_example_filepath('demography_eurostat.h5') s_pop = Session(filepath_hdf) # create a copy of the original session - s_pop_copy = Session(filepath_hdf) - + s_pop_copy = s_pop.copy() + + +- code: | + # 'element_equals' compare arrays one by one + s_pop.element_equals(s_pop_copy) + + +- code: | # 'equals' returns True if all items of the two sessions have exactly the same items s_pop.equals(s_pop_copy) - code: | - # create a copy of the original session but with the array - # 'births' slightly modified for some labels combination - s_pop_alternative = Session(filepath_hdf) - s_pop_alternative.births *= random_multiplicator - - # 'equals' returns False if at least on item of the two sessions are different in values or axes - s_pop.equals(s_pop_alternative) + # slightly modify the 'pop' array for some labels combination + s_pop_copy.pop += random_increment - code: | - # add an array to the session - s_pop_new_output = Session(filepath_hdf) - s_pop_new_output.gender_ratio = s_pop_new_output.pop.ratio('gender') - - # 'equals' returns False if at least on item is not present in the two sessions - s_pop.equals(s_pop_new_output) + # the 'pop' array is different between the two sessions + s_pop.element_equals(s_pop_copy) -- markdown: | - The ``element_equals`` method will compare items of two sessions one by one and return an array of boolean values: +- code: | + # 'equals' returns False if at least one item of the two sessions are different in values or axes + s_pop.equals(s_pop_copy) - code: | - # 'element_equals' compare arrays one by one + # reset the 'copy' session as a copy of the original session + s_pop_copy = s_pop.copy() + + # add an array to the 'copy' session + s_pop_copy.gender_ratio = s_pop_copy.pop.ratio('gender') + + +- code: | + # the 'gender_ratio' array is not present in the original session 
s_pop.element_equals(s_pop_copy) - code: | - # array 'births' is different between the two sessions - s_pop.element_equals(s_pop_alternative) + # 'equals' returns False if at least one item is not present in the two sessions + s_pop.equals(s_pop_copy) - markdown: | @@ -236,9 +304,17 @@ cells: - code: | - s_same_values = s_pop == s_pop_alternative + # reset the 'copy' session as a copy of the original session + s_pop_copy = s_pop.copy() + + # slightly modify the 'pop' array for some labels combination + s_pop_copy.pop += random_increment + + +- code: | + s_check_same_values = s_pop == s_pop_copy - s_same_values.births + s_check_same_values.pop - markdown: | @@ -246,7 +322,7 @@ cells: - code: | - s_same_values.country + s_check_same_values.time - markdown: | @@ -254,9 +330,9 @@ cells: - code: | - s_different_values = s_pop != s_pop_alternative + s_check_different_values = s_pop != s_pop_copy - s_different_values.births + s_check_different_values.pop - markdown: | @@ -292,7 +368,7 @@ metadata: name: python nbconvert_exporter: python pygments_lexer: ipython3 - version: 3.6.4 + version: 3.7.3 livereveal: autolaunch: false scroll: true diff --git a/doc/source/tutorial/tutorial_sessions.ipynb b/doc/source/tutorial/tutorial_sessions.ipynb index 4ff690d52..7b5bac662 100644 --- a/doc/source/tutorial/tutorial_sessions.ipynb +++ b/doc/source/tutorial/tutorial_sessions.ipynb @@ -78,7 +78,7 @@ "outputs": [], "source": [ "# load a session representing the results of a demographic model\n", - "filepath_hdf = get_example_filepath('population_session.h5')\n", + "filepath_hdf = get_example_filepath('demography_eurostat.h5')\n", "s_pop = Session(filepath_hdf)\n", "\n", "# print the content of the session\n", @@ -108,7 +108,23 @@ "source": [ "### Selecting And Filtering Items\n", "\n", - "To select an item, simply use the syntax ``.``:" + "Session objects work like ordinary ``dict`` Python objects. 
To select an item, use the usual syntax ``<session_var>['<item_name>']``: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s_pop['pop']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A simpler way is to use the syntax ``<session_var>.<item_name>``:" ] }, { @@ -120,6 +136,15 @@ "s_pop.pop" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + " **Warning:** The syntax ``session_var.item_name`` will work as long as you don't use any special character like ``, ; :`` in the item's name.\n", + "
" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -165,6 +190,57 @@ "s_pop.filter(pattern='[a-k]*')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Iterating over Items\n", + "\n", + "Like the built-in Python ``dict`` objects, Session objects provide methods to iterate over items: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# iterate over item names\n", + "for key in s_pop.keys():\n", + " print(key)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# iterate over items\n", + "for value in s_pop.values():\n", + " if isinstance(value, LArray):\n", + " print(value.info)\n", + " else:\n", + " print(repr(value))\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# iterate over names and items\n", + "for key, value in s_pop.items():\n", + " if isinstance(value, LArray):\n", + " print(key, ':')\n", + " print(value.info)\n", + " else:\n", + " print(key, ':', repr(value))\n", + " print()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -200,8 +276,8 @@ "outputs": [], "source": [ "from larray import random\n", - "random_multiplicator = random.choice([0.98, 1.0, 1.02], p=[0.15, 0.7, 0.15], axes=s_pop.pop.axes)\n", - "random_multiplicator" + "random_increment = random.choice([-1, 0, 1], p=[0.3, 0.4, 0.3], axes=s_pop.pop.axes) * 1000\n", + "random_increment" ] }, { @@ -210,8 +286,8 @@ "metadata": {}, "outputs": [], "source": [ - "# multiply all variables of a session by a common array\n", - "s_pop_rand = s_pop * random_multiplicator\n", + "# add some variables of a session by a common array\n", + "s_pop_rand = s_pop['pop', 'births', 'deaths'] + random_increment\n", "\n", "s_pop_rand.pop" ] @@ -250,17 +326,21 @@ "metadata": {}, "outputs": [], "source": [ - "# force conversion to type int\n", - "def as_type_int(array):\n", 
- " return array.astype(int)\n", + "# add the next year to all arrays\n", + "def add_next_year(array):\n", + " if 'time' in array.axes.names:\n", + " last_year = array.time.i[-1] \n", + " return array.append('time', 0, last_year + 1)\n", + " else:\n", + " return array\n", "\n", - "s_pop_rand_int = s_pop_rand.apply(as_type_int)\n", + "s_pop_with_next_year = s_pop.apply(add_next_year)\n", "\n", "print('pop array before calling apply:')\n", - "print(s_pop_rand.pop)\n", + "print(s_pop.pop)\n", "print()\n", "print('pop array after calling apply:')\n", - "print(s_pop_rand_int.pop)" + "print(s_pop_with_next_year.pop)" ] }, { @@ -276,15 +356,24 @@ "metadata": {}, "outputs": [], "source": [ - "# passing the LArray.astype method directly with argument \n", - "# dtype defined as int\n", - "s_pop_rand_int = s_pop_rand.apply(LArray.astype, dtype=int)\n", + "# add the next year to all arrays.\n", + "# Use the 'copy_values_from_last_year flag' to indicate \n", + "# whether or not to copy values from the last year\n", + "def add_next_year(array, copy_values_from_last_year):\n", + " if 'time' in array.axes.names:\n", + " last_year = array.time.i[-1]\n", + " value = array[last_year] if copy_values_from_last_year else 0\n", + " return array.append('time', value, last_year + 1)\n", + " else:\n", + " return array\n", + "\n", + "s_pop_with_next_year = s_pop.apply(add_next_year, True)\n", "\n", "print('pop array before calling apply:')\n", - "print(s_pop_rand.pop)\n", + "print(s_pop.pop)\n", "print()\n", "print('pop array after calling apply:')\n", - "print(s_pop_rand_int.pop)" + "print(s_pop_with_next_year.pop)" ] }, { @@ -307,9 +396,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "[Session objects](../api.rst#session) provide the two methods to compare two sessions: [equals](../_generated/larray.Session.equals.rst#larray.Session.equals) and [element_equals](../_generated/larray.Session.element_equals.rst#larray.Session.element_equals).\n", + "[Session 
objects](../api.rst#session) provide the two methods to compare two sessions: [equals](../_generated/larray.Session.equals.rst#larray.Session.equals) and [element_equals](../_generated/larray.Session.element_equals.rst#larray.Session.element_equals):\n", "\n", - "The ``equals`` method will return True if **all items** from both sessions are identical, False otherwise:" + "- The ``equals`` method will return True if **all items** from both sessions are identical, False otherwise.\n", + "- The ``element_equals`` method will compare items of two sessions one by one and return an array of boolean values." ] }, { @@ -319,12 +409,29 @@ "outputs": [], "source": [ "# load a session representing the results of a demographic model\n", - "filepath_hdf = get_example_filepath('population_session.h5')\n", + "filepath_hdf = get_example_filepath('demography_eurostat.h5')\n", "s_pop = Session(filepath_hdf)\n", "\n", "# create a copy of the original session\n", - "s_pop_copy = Session(filepath_hdf)\n", - "\n", + "s_pop_copy = s_pop.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 'element_equals' compare arrays one by one\n", + "s_pop.element_equals(s_pop_copy)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "# 'equals' returns True if all items of the two sessions have exactly the same items\n", "s_pop.equals(s_pop_copy)" ] @@ -335,13 +442,8 @@ "metadata": {}, "outputs": [], "source": [ - "# create a copy of the original session but with the array\n", - "# 'births' slightly modified for some labels combination\n", - "s_pop_alternative = Session(filepath_hdf)\n", - "s_pop_alternative.births *= random_multiplicator\n", - "\n", - "# 'equals' returns False if at least on item of the two sessions are different in values or axes\n", - "s_pop.equals(s_pop_alternative)" + "# slightly modify the 'pop' array for some labels combination\n", + 
"s_pop_copy.pop += random_increment " ] }, { @@ -350,19 +452,18 @@ "metadata": {}, "outputs": [], "source": [ - "# add an array to the session\n", - "s_pop_new_output = Session(filepath_hdf)\n", - "s_pop_new_output.gender_ratio = s_pop_new_output.pop.ratio('gender')\n", - "\n", - "# 'equals' returns False if at least on item is not present in the two sessions\n", - "s_pop.equals(s_pop_new_output)" + "# the 'pop' array is different between the two sessions\n", + "s_pop.element_equals(s_pop_copy)" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "The ``element_equals`` method will compare items of two sessions one by one and return an array of boolean values:" + "# 'equals' returns False if at least one item of the two sessions are different in values or axes\n", + "s_pop.equals(s_pop_copy)" ] }, { @@ -371,7 +472,20 @@ "metadata": {}, "outputs": [], "source": [ - "# 'element_equals' compare arrays one by one\n", + "# reset the 'copy' session as a copy of the original session\n", + "s_pop_copy = s_pop.copy()\n", + "\n", + "# add an array to the 'copy' session\n", + "s_pop_copy.gender_ratio = s_pop_copy.pop.ratio('gender')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# the 'gender_ratio' array is not present in the original session\n", "s_pop.element_equals(s_pop_copy)" ] }, @@ -381,8 +495,8 @@ "metadata": {}, "outputs": [], "source": [ - "# array 'births' is different between the two sessions\n", - "s_pop.element_equals(s_pop_alternative)" + "# 'equals' returns False if at least one item is not present in the two sessions\n", + "s_pop.equals(s_pop_copy)" ] }, { @@ -398,9 +512,22 @@ "metadata": {}, "outputs": [], "source": [ - "s_same_values = s_pop == s_pop_alternative\n", + "# reset the 'copy' session as a copy of the original session\n", + "s_pop_copy = s_pop.copy()\n", + "\n", + "# slightly modify the 'pop' array 
for some labels combination\n", + "s_pop_copy.pop += random_increment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s_check_same_values = s_pop == s_pop_copy\n", "\n", - "s_same_values.births" + "s_check_same_values.pop" ] }, { @@ -416,7 +543,7 @@ "metadata": {}, "outputs": [], "source": [ - "s_same_values.country" + "s_check_same_values.time" ] }, { @@ -432,9 +559,9 @@ "metadata": {}, "outputs": [], "source": [ - "s_different_values = s_pop != s_pop_alternative\n", + "s_check_different_values = s_pop != s_pop_copy\n", "\n", - "s_different_values.births " + "s_check_different_values.pop" ] }, { @@ -477,7 +604,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.7.3" }, "livereveal": { "autolaunch": false, diff --git a/doc/source/tutorial/tutorial_string_syntax.ipyml b/doc/source/tutorial/tutorial_string_syntax.ipyml new file mode 100644 index 000000000..685eba2e2 --- /dev/null +++ b/doc/source/tutorial/tutorial_string_syntax.ipyml @@ -0,0 +1,162 @@ +cells: + +- markdown: | + # Pythonic VS String Syntax + + +- markdown: | + Import the LArray library: + + +- code: | + from larray import * + + +- markdown: | + Check the version of LArray: + + +- code: | + from larray import __version__ + __version__ + + +- markdown: | + LArray offers two syntaxes to build axes and make selections and aggregations. + The first one is more ``Pythonic`` (uses Python structures) + For example, you can create an *age_category* axis as follows: + + +- code: | + age_category = Axis(["0-9", "10-17", "18-66", "67+"], "age_category") + age_category + + +- markdown: | + The second one consists of using ``strings`` that are parsed. + It is shorter to type. The same *age_category* axis could have been generated as follows: + + +- code: | + age_category = Axis("age_category=0-9,10-17,18-66,67+") + age_category + + +- markdown: | +
+ **Warning:** The drawback of the string syntax is that some characters such as `, ; = : .. [ ] >>` + have a special meaning and cannot be used with the ``String`` syntax. + If you need to work with labels containing such special characters (when importing data from an external source for example), you have to use the ``Pythonic`` syntax which allows any character to be used in labels. + 
+ + +- markdown: | + ## String Syntax + + +- markdown: | + ### Axes And Arrays creation + + The string syntax allows to easily create axes. + + When creating one axis, the labels are separated using ``,``: + + +- code: | + a = Axis('a=a0,a1,a2,a3') + a + + +- markdown: | + The special syntax ``start..stop`` generates a sequence of labels: + + +- code: | + a = Axis('a=a0..a3') + a + + +- markdown: | + When creating an array, it is possible to define several axes in the same string using ``;`` + + +- code: | + arr = zeros("a=a0..a2; b=b0,b1; c=c0..c5") + arr + + +- markdown: | + ### Selection + + Starting from the array: + + +- code: | + immigration = load_example_data('demography_eurostat').immigration + immigration.info + + +- markdown: | + an example of a selection using the ``Pythonic`` syntax is: + + +- code: | + # since the labels 'Belgium' and 'Netherlands' also exists in the 'citizenship' axis, + # we need to explicitly specify that we want to make a selection over the 'country' axis + immigration_subset = immigration[X.country['Belgium', 'Netherlands'], 'Female', 2015:] + immigration_subset + + +- markdown: | + Using the ``String`` syntax, the same selection becomes: + + +- code: | + immigration_subset = immigration['country[Belgium,Netherlands]', 'Female', 2015:] + immigration_subset + + +- markdown: | + ### Aggregation + + +- markdown: | + An example of an aggregation using the ``Pythonic`` syntax is: + + +- code: | + immigration.sum((X.time[2014::2] >> 'even_years', X.time[::2] >> 'odd_years'), 'citizenship') + + +- markdown: | + Using the ``String`` syntax, the same aggregation becomes: + + +- code: | + immigration.sum('time[2014::2] >> even_years; time[::2] >> odd_years', 'citizenship') + + +- markdown: | + where we used ``;`` to separate groups of labels from the same axis. + + +# The lines below here may be deleted if you do not need them. 
+# --------------------------------------------------------------------------- +metadata: + kernelspec: + display_name: Python 3 + language: python + name: python3 + language_info: + codemirror_mode: + name: ipython + version: 3 + file_extension: .py + mimetype: text/x-python + name: python + nbconvert_exporter: python + pygments_lexer: ipython3 + version: 3.7.3 +nbformat: 4 +nbformat_minor: 2 + diff --git a/doc/source/tutorial/tutorial_string_syntax.ipynb b/doc/source/tutorial/tutorial_string_syntax.ipynb new file mode 100644 index 000000000..dcf0b9350 --- /dev/null +++ b/doc/source/tutorial/tutorial_string_syntax.ipynb @@ -0,0 +1,276 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pythonic VS String Syntax" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import the LArray library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from larray import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check the version of LArray:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from larray import __version__\n", + "__version__" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "LArray offers two syntaxes to build axes and make selections and aggregations.\n", + "The first one is more ``Pythonic`` (uses Python structures) \n", + "For example, you can create an *age_category* axis as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "age_category = Axis([\"0-9\", \"10-17\", \"18-66\", \"67+\"], \"age_category\")\n", + "age_category" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The second one consists of using ``strings`` that are parsed.\n", + "It is shorter to type. 
The same *age_category* axis could have been generated as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "age_category = Axis(\"age_category=0-9,10-17,18-66,67+\")\n", + "age_category" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + " **Warning:** The drawback of the string syntax is that some characters such as `, ; = : .. [ ] >>`\n", + "have a special meaning and cannot be used with the ``String`` syntax. \n", + "If you need to work with labels containing such special characters (when importing data from an external source for example), you have to use the ``Pythonic`` syntax which allows any character to be used in labels.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## String Syntax" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Axes And Arrays creation\n", + "\n", + "The string syntax allows to easily create axes.\n", + "\n", + "When creating one axis, the labels are separated using ``,``:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "a = Axis('a=a0,a1,a2,a3')\n", + "a" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The special syntax ``start..stop`` generates a sequence of labels:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "a = Axis('a=a0..a3')\n", + "a" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When creating an array, it is possible to define several axes in the same string using ``;``" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "arr = zeros(\"a=a0..a2; b=b0,b1; c=c0..c5\")\n", + "arr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Selection\n", + "\n", + "Starting from the array: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "immigration = load_example_data('demography_eurostat').immigration\n", + "immigration.info" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "an example of a selection using the ``Pythonic`` syntax is:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# since the labels 'Belgium' and 'Netherlands' also exists in the 'citizenship' axis, \n", + "# we need to explicitly specify that we want to make a selection over the 'country' axis\n", + "immigration_subset = immigration[X.country['Belgium', 'Netherlands'], 'Female', 2015:]\n", + 
"immigration_subset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the ``String`` syntax, the same selection becomes:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "immigration_subset = immigration['country[Belgium,Netherlands]', 'Female', 2015:]\n", + "immigration_subset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Aggregation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An example of an aggregation using the ``Pythonic`` syntax is:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "immigration.sum((X.time[2014::2] >> 'even_years', X.time[::2] >> 'odd_years'), 'citizenship')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the ``String`` syntax, the same aggregation becomes:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "immigration.sum('time[2014::2] >> even_years; time[::2] >> odd_years', 'citizenship')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "where we used ``;`` to separate groups of labels from the same axis." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/doc/source/tutorial/tutorial_transforming.ipyml b/doc/source/tutorial/tutorial_transforming.ipyml index a52787edd..d14adcb46 100644 --- a/doc/source/tutorial/tutorial_transforming.ipyml +++ b/doc/source/tutorial/tutorial_transforming.ipyml @@ -9,7 +9,6 @@ cells: import warnings warnings.filterwarnings("ignore", message=r'.*numpy.dtype size changed*') - id: 0 metadata: nbsphinx: hidden @@ -20,7 +19,6 @@ cells: - code: | from larray import * - id: 1 - markdown: | Check the version of LArray: @@ -30,66 +28,109 @@ cells: from larray import __version__ __version__ - id: 2 -- markdown: | - ## Manipulating axes +- code: | + # load the 'demography_eurostat' dataset + demo_eurostat = load_example_data('demography_eurostat') - code: | - # let's start with - pop = load_example_data('demography').pop[2016, 'BruCap', 90:95] + # get a copy of the 'pop' array from the 'demography_eurostat' dataset + pop = demo_eurostat.pop.copy() pop - id: 3 + +- markdown: | + ## Manipulating axes + + + LArray offers several methods to manipulate the axes and labels of an array: + + - [set_labels](../_generated/larray.LArray.set_labels.rst#larray.LArray.set_labels): to replace all or some labels of one or several axes. + - [rename](../_generated/larray.LArray.rename.rst#larray.LArray.rename): to replace one or several axis names. + - [set_axes](../_generated/larray.LArray.set_axes.rst#larray.LArray.set_axes): to replace one or several axes. + - [transpose](../_generated/larray.LArray.transpose.rst#larray.LArray.transpose): to modify the order of axes. 
+ - [drop](../_generated/larray.LArray.drop.rst#larray.LArray.drop): to remove one or several labels. + - [combine_axes](../_generated/larray.LArray.combine_axes.rst#larray.LArray.combine_axes): to combine axes. + - [split_axes](../_generated/larray.LArray.split_axes.rst#larray.LArray.split_axes): to split one or several axes by splitting their labels and names. + - [reindex](../_generated/larray.LArray.reindex.rst#larray.LArray.reindex): to reorder, add and remove labels of one or several axes. + - [insert](../_generated/larray.LArray.insert.rst#larray.LArray.insert): to insert a label at a given position. + - markdown: | ### Relabeling - Replace all labels of one axis + Replace some labels of an axis: + + +- code: | + # replace all labels of the 'gender' axis by passing the list of all new labels + pop_new_labels = pop.set_labels('gender', ['Men', 'Women']) + pop_new_labels - code: | - # returns a copy by default - pop_new_labels = pop.set_labels('sex', ['Men', 'Women']) + # set all labels of the 'country' axis to uppercase by passing the function str.upper() + pop_new_labels = pop.set_labels('country', str.upper) pop_new_labels - id: 4 - code: | - # inplace flag avoids to create a copy - pop.set_labels('sex', ['M', 'F'], inplace=True) + # replace only one label of the 'gender' axis by passing a dict + pop_new_labels = pop.set_labels('gender', {'Male': 'Men'}) + pop_new_labels + + +- markdown: | + See [set_labels](../_generated/larray.LArray.set_labels.rst#larray.LArray.set_labels) for more details and examples. 
- id: 5 - markdown: | ### Renaming axes - Rename one axis + Rename one axis: - code: | - pop.info + # 'rename' returns a copy of the array + pop_new_names = pop.rename('time', 'year') + pop_new_names + + +- markdown: | + Rename several axes at once: - id: 6 - code: | - # 'rename' returns a copy of the array - pop2 = pop.rename('sex', 'gender') - pop2 + pop_new_names = pop.rename({'gender': 'sex', 'time': 'year'}) + pop_new_names + + +- markdown: | + See [rename](../_generated/larray.LArray.rename.rst#larray.LArray.rename) for more details and examples. + + +- markdown: | + ### Replacing Axes + + Replace one axis: + + +- code: | + new_gender = Axis('sex=Men,Women') + pop_new_axis = pop.set_axes('gender', new_gender) + pop_new_axis - id: 7 - markdown: | - Rename several axes at once + Replace several axes at once: - code: | - # No x. here because sex and nat are keywords and not actual axes - pop2 = pop.rename(sex='gender', nat='nationality') - pop2 + new_country = Axis('country_codes=BE,FR,DE') + pop_new_axes = pop.set_axes({'country': new_country, 'gender': new_gender}) + pop_new_axes - id: 8 - markdown: | ### Reordering axes @@ -101,75 +142,238 @@ cells: - code: | - # starting order : age, sex, nat + # starting order : country, gender, time pop - id: 9 - code: | - # no argument --> reverse axes - pop.transpose() + # no argument --> reverse all axes + pop_transposed = pop.transpose() # .T is a shortcut for .transpose() - pop.T + pop_transposed = pop.T + + pop_transposed - id: 10 - code: | # reorder according to list - pop.transpose('age', 'nat', 'sex') + pop_transposed = pop.transpose('gender', 'country', 'time') + pop_transposed + + +- code: | + # move 'time' axis at first place + # not mentioned axes come after those which are mentioned (and keep their relative order) + pop_transposed = pop.transpose('time') + pop_transposed + + +- code: | + # move 'gender' axis at last place + # not mentioned axes come before those which are mentioned (and keep their relative 
order) + pop_transposed = pop.transpose(..., 'gender') + pop_transposed + + +- markdown: | + See [transpose](../_generated/larray.LArray.transpose.rst#larray.LArray.transpose) for more details and examples. + + +- markdown: | + ### Dropping Labels + + +- code: | + pop_labels_dropped = pop.drop([2014, 2016]) + pop_labels_dropped + + +- markdown: | + See [drop](../_generated/larray.LArray.drop.rst#larray.LArray.drop) for more details and examples. + + +- markdown: | + ### Combine And Split Axes + + Combine two axes: - id: 11 - code: | - # axes not mentioned come after those which are mentioned (and keep their relative order) - pop.transpose('sex') + pop_combined_axes = pop.combine_axes(('country', 'gender')) + pop_combined_axes + + +- markdown: | + Split an axis: + + +- code: | + pop_split_axes = pop_combined_axes.split_axes('country_gender') + pop_split_axes + + +- markdown: | + See [combine_axes](../_generated/larray.LArray.combine_axes.rst#larray.LArray.combine_axes) and [split_axes](../_generated/larray.LArray.split_axes.rst#larray.LArray.split_axes) for more details and examples. + + +- markdown: | + ### Reordering, adding and removing labels + + The ``reindex`` method allows to reorder, add and remove labels along one axis: + + +- code: | + # reverse years + remove 2013 + add 2018 + copy data for 2017 to 2018 + pop_new_time = pop.reindex('time', '2018..2014', fill_value=pop[2017]) + pop_new_time + + +- markdown: | + or several axes: + + +- code: | + pop_new = pop.reindex({'country': 'country=Luxembourg,Belgium,France,Germany', + 'time': 'time=2018..2014'}, fill_value=0) + pop_new + + +- markdown: | + See [reindex](../_generated/larray.LArray.reindex.rst#larray.LArray.reindex) for more details and examples. 
+ + +- markdown: | + Another way to insert new labels is to use the ``insert`` method: + + +- code: | + # insert a new country before 'France' with all values set to 0 + pop_new_country = pop.insert(0, before='France', label='Luxembourg') + # or equivalently + pop_new_country = pop.insert(0, after='Belgium', label='Luxembourg') + + pop_new_country + + +- markdown: | + See [insert](../_generated/larray.LArray.insert.rst#larray.LArray.insert) for more details and examples. + + +- markdown: | + ## Sorting + + + - [sort_axes](../_generated/larray.LArray.sort_axes.rst#larray.LArray.sort_axes): sort the labels of an axis. + - [labelsofsorted](../_generated/larray.LArray.labelsofsorted.rst#larray.LArray.labelsofsorted): give labels which would sort an axis. + - [sort_values](../_generated/larray.LArray.sort_values.rst#larray.LArray.sort_values): sort axes according to values + + +- code: | + # get a copy of the 'pop_benelux' array + pop_benelux = demo_eurostat.pop_benelux.copy() + pop_benelux + + +- markdown: | + Sort an axis (alphabetically if labels are strings) + + +- code: | + pop_sorted = pop_benelux.sort_axes('gender') + pop_sorted + + +- markdown: | + Give labels which would sort the axis + + +- code: | + pop_benelux.labelsofsorted('country') + + +- markdown: | + Sort according to values + + +- code: | + pop_sorted = pop_benelux.sort_values(('Male', 2017)) + pop_sorted - id: 12 - markdown: | ## Combining arrays - ### Append/Prepend + LArray offers several methods to combine arrays: - Append/prepend one element to an axis of an array + - [insert](../_generated/larray.LArray.insert.rst#larray.LArray.insert): inserts an array in another array along an axis + - [append](../_generated/larray.LArray.append.rst#larray.LArray.append): adds an array at the end of an axis. + - [prepend](../_generated/larray.LArray.prepend.rst#larray.LArray.prepend): adds an array at the beginning of an axis. 
+ - [extend](../_generated/larray.LArray.extend.rst#larray.LArray.extend): extends an array along an axis. + - [stack](../_generated/larray.stack.rst#larray.stack): combines several arrays along an axis. + + +- markdown: | + ### Insert - code: | - pop = load_example_data('demography').pop[2016, 'BruCap', 90:95] + other_countries = zeros((Axis('country=Luxembourg,Netherlands'), pop.gender, pop.time), dtype=int) - # imagine that you have now acces to the number of non-EU foreigners - data = [[25, 54], [15, 33], [12, 28], [11, 37], [5, 21], [7, 19]] - pop_non_eu = LArray(data, pop['FO'].axes) + # insert new countries before 'France' + pop_new_countries = pop.insert(other_countries, before='France') + pop_new_countries + + +- markdown: | + See [insert](../_generated/larray.LArray.insert.rst#larray.LArray.insert) for more details and examples. + + +- markdown: | + ### Append - # you can do something like this - pop = pop.append('nat', pop_non_eu, 'NEU') - pop + Append one element to an axis of an array: - id: 13 - code: | - # you can also add something at the start of an axis - pop = pop.prepend('sex', pop.sum('sex'), 'B') - pop + # append data for 'Luxembourg' + pop_new = pop.append('country', pop_benelux['Luxembourg'], 'Luxembourg') + pop_new - id: 14 - markdown: | - The value being appended/prepended can have missing (or even extra) axes as long as common axes are compatible + The value being appended can have missing (or even extra) axes as long as common axes are compatible: - code: | - aliens = zeros(pop.axes['sex']) - aliens + pop_lux = LArray([-1, 1], pop.gender) + pop_lux - id: 15 - code: | - pop = pop.append('nat', aliens, 'AL') - pop + pop_new = pop.append('country', pop_lux, 'Luxembourg') + pop_new + + +- markdown: | + See [append](../_generated/larray.LArray.append.rst#larray.LArray.append) for more details and examples. 
+ + +- markdown: | + ### Prepend + + Prepend one element to an axis of an array: + + +- code: | + # prepend data for 'Luxembourg' + pop_new = pop.prepend('country', pop_benelux['Luxembourg'], 'Luxembourg') + pop_new + + +- markdown: | + See [prepend](../_generated/larray.LArray.prepend.rst#larray.LArray.prepend) for more details and examples. - id: 16 - markdown: | ### Extend @@ -178,14 +382,13 @@ cells: - code: | - _pop = load_example_data('demography').pop - pop = _pop[2016, 'BruCap', 90:95] - pop_next = _pop[2016, 'BruCap', 96:100] - - # concatenate along age axis - pop.extend('age', pop_next) + pop_extended = pop.extend('country', pop_benelux[['Luxembourg', 'Netherlands']]) + pop_extended + + +- markdown: | + See [extend](../_generated/larray.LArray.extend.rst#larray.LArray.extend) for more details and examples. - id: 17 - markdown: | ### Stack @@ -194,49 +397,62 @@ cells: - code: | - # imagine you have loaded data for each nationality in different arrays (e.g. loaded from different Excel sheets) - pop_be, pop_fo = pop['BE'], pop['FO'] - - # first way to stack them - nat = Axis('nat=BE,FO,NEU') - pop = stack([pop_be, pop_fo, pop_non_eu], nat) + # imagine you have loaded data for each country in different arrays + # (e.g. loaded from different Excel sheets) + pop_be = pop['Belgium'] + pop_fr = pop['France'] + pop_de = pop['Germany'] - # second way - pop = stack([('BE', pop_be), ('FO', pop_fo), ('NEU', pop_non_eu)], 'nat') - - pop + pop_stacked = stack({'Belgium': pop_be, 'France': pop_fr, 'Germany': pop_de}, 'country') + pop_stacked - id: 18 - markdown: | - ## Sorting + See [stack](../_generated/larray.stack.rst#larray.stack) for more details and examples. + + +- markdown: | + ## Aligning Arrays - Sort an axis (alphabetically if labels are strings) + The ``align`` method aligns two arrays on their axes with a specified join method. + In other words, it ensures all common axes are compatible. 
- code: | - pop_sorted = pop.sort_axes('nat') - pop_sorted + # get a copy of the 'births' array + births = demo_eurostat.births.copy() + + # align the two arrays with the 'inner' join method + pop_aligned, births_aligned = pop_benelux.align(births, join='inner') - id: 19 -- markdown: | - Give labels which would sort the axis +- code: | + print('pop_benelux before align:') + print(pop_benelux) + print() + print('pop_benelux after align:') + print(pop_aligned) - code: | - pop_sorted.labelsofsorted('sex') + print('births before align:') + print(births) + print() + print('births after align:') + print(births_aligned) - id: 20 - markdown: | - Sort according to values + Aligned arrays can then be used in arithmetic operations: - code: | - pop_sorted.sort_values((90, 'F')) + pop_aligned - births_aligned + + +- markdown: | + See [align](../_generated/larray.LArray.align.rst#larray.LArray.align) for more details and examples. - id: 21 # The lines below here may be deleted if you do not need them. 
# --------------------------------------------------------------------------- @@ -255,22 +471,10 @@ metadata: name: python nbconvert_exporter: python pygments_lexer: ipython3 - version: 3.6.4 + version: 3.7.3 livereveal: autolaunch: false scroll: true nbformat: 4 nbformat_minor: 2 -# --------------------------------------------------------------------------- -data: - [{execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, - outputs: []}, {execution_count: null, outputs: []}, {execution_count: null, outputs: []}, - {execution_count: null, outputs: []}, {execution_count: null, outputs: []}] - diff --git a/doc/source/tutorial/tutorial_transforming.ipynb b/doc/source/tutorial/tutorial_transforming.ipynb index 7f1357185..1632f2d78 100644 --- a/doc/source/tutorial/tutorial_transforming.ipynb +++ b/doc/source/tutorial/tutorial_transforming.ipynb @@ -54,10 +54,13 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "## Manipulating axes\n" + "# load the 'demography_eurostat' dataset\n", + "demo_eurostat = load_example_data('demography_eurostat')" ] }, { @@ -66,18 +69,38 @@ "metadata": {}, "outputs": [], "source": [ - "# let's start with\n", - "pop = load_example_data('demography').pop[2016, 'BruCap', 90:95]\n", + "# get a copy of the 'pop' array from the 'demography_eurostat' 
dataset\n", + "pop = demo_eurostat.pop.copy()\n", "pop" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Manipulating axes\n", + "\n", + "\n", + "LArray offers several methods to manipulate the axes and labels of an array:\n", + "\n", + "- [set_labels](../_generated/larray.LArray.set_labels.rst#larray.LArray.set_labels): to replace all or some labels of one or several axes.\n", + "- [rename](../_generated/larray.LArray.rename.rst#larray.LArray.rename): to replace one or several axis names.\n", + "- [set_axes](../_generated/larray.LArray.set_axes.rst#larray.LArray.set_axes): to replace one or several axes.\n", + "- [transpose](../_generated/larray.LArray.transpose.rst#larray.LArray.transpose): to modify the order of axes.\n", + "- [drop](../_generated/larray.LArray.drop.rst#larray.LArray.drop): to remove one or several labels.\n", + "- [combine_axes](../_generated/larray.LArray.combine_axes.rst#larray.LArray.combine_axes): to combine axes.\n", + "- [split_axes](../_generated/larray.LArray.split_axes.rst#larray.LArray.split_axes): to split one or several axes by splitting their labels and names.\n", + "- [reindex](../_generated/larray.LArray.reindex.rst#larray.LArray.reindex): to reorder, add and remove labels of one or several axes.\n", + "- [insert](../_generated/larray.LArray.insert.rst#larray.LArray.insert): to insert a label at a given position.\n" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Relabeling\n", "\n", - "Replace all labels of one axis\n" + "Replace some labels of an axis:\n" ] }, { @@ -86,8 +109,8 @@ "metadata": {}, "outputs": [], "source": [ - "# returns a copy by default\n", - "pop_new_labels = pop.set_labels('sex', ['Men', 'Women'])\n", + "# replace all labels of the 'gender' axis by passing the list of all new labels\n", + "pop_new_labels = pop.set_labels('gender', ['Men', 'Women'])\n", "pop_new_labels" ] }, @@ -97,8 +120,27 @@ "metadata": {}, "outputs": [], "source": [ - "# inplace flag avoids to 
create a copy\n", - "pop.set_labels('sex', ['M', 'F'], inplace=True)" + "# set all labels of the 'country' axis to uppercase by passing the function str.upper()\n", + "pop_new_labels = pop.set_labels('country', str.upper)\n", + "pop_new_labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# replace only one label of the 'gender' axis by passing a dict\n", + "pop_new_labels = pop.set_labels('gender', {'Male': 'Men'})\n", + "pop_new_labels" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See [set_labels](../_generated/larray.LArray.set_labels.rst#larray.LArray.set_labels) for more details and examples." ] }, { @@ -107,7 +149,7 @@ "source": [ "### Renaming axes\n", "\n", - "Rename one axis\n" + "Rename one axis:\n" ] }, { @@ -116,7 +158,16 @@ "metadata": {}, "outputs": [], "source": [ - "pop.info" + "# 'rename' returns a copy of the array\n", + "pop_new_names = pop.rename('time', 'year')\n", + "pop_new_names" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Rename several axes at once:\n" ] }, { @@ -125,16 +176,24 @@ "metadata": {}, "outputs": [], "source": [ - "# 'rename' returns a copy of the array\n", - "pop2 = pop.rename('sex', 'gender')\n", - "pop2" + "pop_new_names = pop.rename({'gender': 'sex', 'time': 'year'})\n", + "pop_new_names" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See [rename](../_generated/larray.LArray.rename.rst#larray.LArray.rename) for more details and examples." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Rename several axes at once\n" + "### Replacing Axes\n", + "\n", + "Replace one axis:" ] }, { @@ -143,9 +202,27 @@ "metadata": {}, "outputs": [], "source": [ - "# No x. 
here because sex and nat are keywords and not actual axes\n", - "pop2 = pop.rename(sex='gender', nat='nationality')\n", - "pop2" + "new_gender = Axis('sex=Men,Women')\n", + "pop_new_axis = pop.set_axes('gender', new_gender)\n", + "pop_new_axis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Replace several axes at once:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_country = Axis('country_codes=BE,FR,DE') \n", + "pop_new_axes = pop.set_axes({'country': new_country, 'gender': new_gender})\n", + "pop_new_axes" ] }, { @@ -166,7 +243,7 @@ "metadata": {}, "outputs": [], "source": [ - "# starting order : age, sex, nat\n", + "# starting order : country, gender, time\n", "pop" ] }, @@ -176,11 +253,13 @@ "metadata": {}, "outputs": [], "source": [ - "# no argument --> reverse axes\n", - "pop.transpose()\n", + "# no argument --> reverse all axes\n", + "pop_transposed = pop.transpose()\n", "\n", "# .T is a shortcut for .transpose()\n", - "pop.T" + "pop_transposed = pop.T\n", + "\n", + "pop_transposed" ] }, { @@ -190,7 +269,209 @@ "outputs": [], "source": [ "# reorder according to list\n", - "pop.transpose('age', 'nat', 'sex')" + "pop_transposed = pop.transpose('gender', 'country', 'time')\n", + "pop_transposed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# move 'time' axis at first place\n", + "# not mentioned axes come after those which are mentioned (and keep their relative order)\n", + "pop_transposed = pop.transpose('time')\n", + "pop_transposed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# move 'gender' axis at last place\n", + "# not mentioned axes come before those which are mentioned (and keep their relative order)\n", + "pop_transposed = pop.transpose(..., 'gender')\n", + "pop_transposed" + ] + }, + { + "cell_type": "markdown", 
+ "metadata": {}, + "source": [ + "See [transpose](../_generated/larray.LArray.transpose.rst#larray.LArray.transpose) for more details and examples." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dropping Labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pop_labels_dropped = pop.drop([2014, 2016])\n", + "pop_labels_dropped" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See [drop](../_generated/larray.LArray.drop.rst#larray.LArray.drop) for more details and examples." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Combine And Split Axes\n", + "\n", + "Combine two axes:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pop_combined_axes = pop.combine_axes(('country', 'gender'))\n", + "pop_combined_axes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Split an axis:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pop_split_axes = pop_combined_axes.split_axes('country_gender')\n", + "pop_split_axes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See [combine_axes](../_generated/larray.LArray.combine_axes.rst#larray.LArray.combine_axes) and [split_axes](../_generated/larray.LArray.split_axes.rst#larray.LArray.split_axes) for more details and examples." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reordering, adding and removing labels\n", + "\n", + "The ``reindex`` method allows to reorder, add and remove labels along one axis:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# reverse years + remove 2013 + add 2018 + copy data for 2017 to 2018\n", + "pop_new_time = pop.reindex('time', '2018..2014', fill_value=pop[2017])\n", + "pop_new_time" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "or several axes:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pop_new = pop.reindex({'country': 'country=Luxembourg,Belgium,France,Germany', \n", + " 'time': 'time=2018..2014'}, fill_value=0)\n", + "pop_new" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See [reindex](../_generated/larray.LArray.reindex.rst#larray.LArray.reindex) for more details and examples." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Another way to insert new labels is to use the ``insert`` method:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# insert a new country before 'France' with all values set to 0\n", + "pop_new_country = pop.insert(0, before='France', label='Luxembourg')\n", + "# or equivalently\n", + "pop_new_country = pop.insert(0, after='Belgium', label='Luxembourg')\n", + "\n", + "pop_new_country" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See [insert](../_generated/larray.LArray.insert.rst#larray.LArray.insert) for more details and examples." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sorting\n", + "\n", + "\n", + "- [sort_axes](../_generated/larray.LArray.sort_axes.rst#larray.LArray.sort_axes): sort the labels of an axis.\n", + "- [labelsofsorted](../_generated/larray.LArray.labelsofsorted.rst#larray.LArray.labelsofsorted): give labels which would sort an axis. \n", + "- [sort_values](../_generated/larray.LArray.sort_values.rst#larray.LArray.sort_values): sort axes according to values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get a copy of the 'pop_benelux' array\n", + "pop_benelux = demo_eurostat.pop_benelux.copy()\n", + "pop_benelux" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sort an axis (alphabetically if labels are strings)" ] }, { @@ -199,8 +480,41 @@ "metadata": {}, "outputs": [], "source": [ - "# axes not mentioned come after those which are mentioned (and keep their relative order)\n", - "pop.transpose('sex')" + "pop_sorted = pop_benelux.sort_axes('gender')\n", + "pop_sorted" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Give labels which would sort the axis\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pop_benelux.labelsofsorted('country')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sort according to values\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pop_sorted = pop_benelux.sort_values(('Male', 2017))\n", + "pop_sorted" ] }, { @@ -209,9 +523,20 @@ "source": [ "## Combining arrays\n", "\n", - "### Append/Prepend\n", + "LArray offers several methods to combine arrays:\n", "\n", - "Append/prepend one element to an axis of an array\n" + "- [insert](../_generated/larray.LArray.insert.rst#larray.LArray.insert): inserts an array in another array along an 
axis\n", + "- [append](../_generated/larray.LArray.append.rst#larray.LArray.append): adds an array at the end of an axis.\n", + "- [prepend](../_generated/larray.LArray.prepend.rst#larray.LArray.prepend): adds an array at the beginning of an axis.\n", + "- [extend](../_generated/larray.LArray.extend.rst#larray.LArray.extend): extends an array along an axis.\n", + "- [stack](../_generated/larray.stack.rst#larray.stack): combines several arrays along an axis.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Insert" ] }, { @@ -220,15 +545,27 @@ "metadata": {}, "outputs": [], "source": [ - "pop = load_example_data('demography').pop[2016, 'BruCap', 90:95]\n", + "other_countries = zeros((Axis('country=Luxembourg,Netherlands'), pop.gender, pop.time), dtype=int)\n", "\n", - "# imagine that you have now acces to the number of non-EU foreigners\n", - "data = [[25, 54], [15, 33], [12, 28], [11, 37], [5, 21], [7, 19]]\n", - "pop_non_eu = LArray(data, pop['FO'].axes)\n", + "# insert new countries before 'France'\n", + "pop_new_countries = pop.insert(other_countries, before='France')\n", + "pop_new_countries" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See [insert](../_generated/larray.LArray.insert.rst#larray.LArray.insert) for more details and examples." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Append\n", "\n", - "# you can do something like this\n", - "pop = pop.append('nat', pop_non_eu, 'NEU')\n", - "pop" + "Append one element to an axis of an array:" ] }, { @@ -237,16 +574,16 @@ "metadata": {}, "outputs": [], "source": [ - "# you can also add something at the start of an axis\n", - "pop = pop.prepend('sex', pop.sum('sex'), 'B')\n", - "pop" + "# append data for 'Luxembourg'\n", + "pop_new = pop.append('country', pop_benelux['Luxembourg'], 'Luxembourg')\n", + "pop_new" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The value being appended/prepended can have missing (or even extra) axes as long as common axes are compatible\n" + "The value being appended can have missing (or even extra) axes as long as common axes are compatible:\n" ] }, { @@ -255,8 +592,8 @@ "metadata": {}, "outputs": [], "source": [ - "aliens = zeros(pop.axes['sex'])\n", - "aliens" + "pop_lux = LArray([-1, 1], pop.gender)\n", + "pop_lux" ] }, { @@ -265,8 +602,42 @@ "metadata": {}, "outputs": [], "source": [ - "pop = pop.append('nat', aliens, 'AL')\n", - "pop" + "pop_new = pop.append('country', pop_lux, 'Luxembourg')\n", + "pop_new" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See [append](../_generated/larray.LArray.append.rst#larray.LArray.append) for more details and examples." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepend\n", + "\n", + "Prepend one element to an axis of an array:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# append data for 'Luxembourg'\n", + "pop_new = pop.prepend('country', pop_benelux['Luxembourg'], 'Luxembourg')\n", + "pop_new" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See [prepend](../_generated/larray.LArray.prepend.rst#larray.LArray.prepend) for more details and examples." 
] }, { @@ -284,12 +655,15 @@ "metadata": {}, "outputs": [], "source": [ - "_pop = load_example_data('demography').pop\n", - "pop = _pop[2016, 'BruCap', 90:95]\n", - "pop_next = _pop[2016, 'BruCap', 96:100]\n", - "\n", - "# concatenate along age axis\n", - "pop.extend('age', pop_next)" + "pop_extended = pop.extend('country', pop_benelux[['Luxembourg', 'Netherlands']])\n", + "pop_extended" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See [extend](../_generated/larray.LArray.extend.rst#larray.LArray.extend) for more details and examples." ] }, { @@ -307,26 +681,31 @@ "metadata": {}, "outputs": [], "source": [ - "# imagine you have loaded data for each nationality in different arrays (e.g. loaded from different Excel sheets)\n", - "pop_be, pop_fo = pop['BE'], pop['FO']\n", - "\n", - "# first way to stack them\n", - "nat = Axis('nat=BE,FO,NEU')\n", - "pop = stack([pop_be, pop_fo, pop_non_eu], nat)\n", - "\n", - "# second way\n", - "pop = stack([('BE', pop_be), ('FO', pop_fo), ('NEU', pop_non_eu)], 'nat')\n", + "# imagine you have loaded data for each country in different arrays \n", + "# (e.g. loaded from different Excel sheets)\n", + "pop_be = pop['Belgium']\n", + "pop_fr = pop['France']\n", + "pop_de = pop['Germany']\n", "\n", - "pop" + "pop_stacked = stack({'Belgium': pop_be, 'France': pop_fr, 'Germany': pop_de}, 'country')\n", + "pop_stacked" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Sorting\n", + "See [stack](../_generated/larray.stack.rst#larray.stack) for more details and examples." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aligning Arrays\n", "\n", - "Sort an axis (alphabetically if labels are strings)\n" + "The ``align`` method aligns two arrays on their axes with a specified join method.\n", + "In other words, it ensures all common axes are compatible." 
] }, { @@ -335,15 +714,24 @@ "metadata": {}, "outputs": [], "source": [ - "pop_sorted = pop.sort_axes('nat')\n", - "pop_sorted" + "# get a copy of the 'births' array\n", + "births = demo_eurostat.births.copy()\n", + "\n", + "# align the two arrays with the 'inner' join method\n", + "pop_aligned, births_aligned = pop_benelux.align(births, join='inner')" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "Give labels which would sort the axis\n" + "print('pop_benelux before align:')\n", + "print(pop_benelux)\n", + "print()\n", + "print('pop_benelux after align:')\n", + "print(pop_aligned)" ] }, { @@ -352,14 +740,18 @@ "metadata": {}, "outputs": [], "source": [ - "pop_sorted.labelsofsorted('sex')" + "print('births before align:')\n", + "print(births)\n", + "print()\n", + "print('births after align:')\n", + "print(births_aligned)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Sort according to values\n" + "Aligned arrays can then be used in arithmetic operations:" ] }, { @@ -368,7 +760,14 @@ "metadata": {}, "outputs": [], "source": [ - "pop_sorted.sort_values((90, 'F'))" + "pop_aligned - births_aligned" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See [align](../_generated/larray.LArray.align.rst#larray.LArray.align) for more details and examples." ] } ], @@ -389,7 +788,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.7.3" }, "livereveal": { "autolaunch": false, diff --git a/larray/core/array.py b/larray/core/array.py index 17612fa91..39a4ac7c8 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -3789,8 +3789,7 @@ def percent(self, *axes): BE 40.0 60.0 FO 20.0 80.0 """ - # dividing by self.sum(*axes) * 0.01 would be faster in many cases but I suspect it loose more precision. 
- return self * 100 / self.sum(*axes) + return self * 100.0 / self.sum(*axes) # aggregate method decorator def _decorate_agg_method(npfunc, nanfunc=None, commutative=False, by_agg=False, extra_kwargs=[], diff --git a/larray/example.py b/larray/example.py index d9f9208d5..578757b54 100644 --- a/larray/example.py +++ b/larray/example.py @@ -5,10 +5,9 @@ _TEST_DIR = os.path.join(os.path.dirname(__file__), 'tests') EXAMPLE_FILES_DIR = os.path.join(_TEST_DIR, 'data') -# TODO : replace 'demography.h5' by 'population_session.h5' and remove 'demo' ? AVAILABLE_EXAMPLE_DATA = { - 'demo': os.path.join(EXAMPLE_FILES_DIR, 'population_session.h5'), - 'demography': os.path.join(EXAMPLE_FILES_DIR, 'demography.h5') + 'demography': os.path.join(EXAMPLE_FILES_DIR, 'demography.h5'), + 'demography_eurostat': os.path.join(EXAMPLE_FILES_DIR, 'demography_eurostat.h5') } AVAILABLE_EXAMPLE_FILES = os.listdir(EXAMPLE_FILES_DIR) @@ -43,6 +42,7 @@ def get_example_filepath(fname): return fpath +# TODO : replace # doctest: +SKIP by # doctest: +NORMALIZE_WHITESPACE once Python 2 has been dropped def load_example_data(name): r"""Load arrays used in the tutorial so that all examples in it can be reproduced. @@ -52,29 +52,37 @@ def load_example_data(name): Example data to load. Available example datasets are: - demography + - demography_eurostat Returns ------- Session - Session containing one or several arrays + Session containing one or several arrays. Examples -------- >>> demo = load_example_data('demography') - >>> demo.pop.info # doctest: +SKIP - 26 x 3 x 121 x 2 x 2 - time [26]: 1991 1992 1993 ... 2014 2015 2016 - geo [3]: 'BruCap' 'Fla' 'Wal' - age [121]: 0 1 2 ... 118 119 120 - sex [2]: 'M' 'F' - nat [2]: 'BE' 'FO' - >>> demo.qx.info # doctest: +SKIP - 26 x 3 x 121 x 2 x 2 - time [26]: 1991 1992 1993 ... 2014 2015 2016 - geo [3]: 'BruCap' 'Fla' 'Wal' - age [121]: 0 1 2 ... 
118 119 120 - sex [2]: 'M' 'F' - nat [2]: 'BE' 'FO' + >>> print(demo.summary()) # doctest: +NORMALIZE_WHITESPACE + hh: time, geo, hh_type (26 x 3 x 7) [int64] + pop: time, geo, age, sex, nat (26 x 3 x 121 x 2 x 2) [int64] + qx: time, geo, age, sex, nat (26 x 3 x 121 x 2 x 2) [float64] + >>> demo = load_example_data('demography_eurostat') # doctest: +SKIP + >>> print(demo.summary()) # doctest: +SKIP + Metadata: + title: Demographic datasets for a small selection of countries in Europe + source: demo_jpan, demo_fasec, demo_magec and migr_imm1ctz tables from Eurostat + gender: gender ['Male' 'Female'] (2) + country: country ['Belgium' 'France' 'Germany'] (3) + country_benelux: country_benelux ['Belgium' 'Luxembourg' 'Netherlands'] (3) + citizenship: citizenship ['Belgium' 'Luxembourg' 'Netherlands'] (3) + time: time [2013 2014 2015 2016 2017] (5) + even_years: time[2014 2016] >> even_years (2) + odd_years: time[2013 2015 2017] >> odd_years (3) + births: country, gender, time (3 x 2 x 5) [int32] + deaths: country, gender, time (3 x 2 x 5) [int32] + immigration: country, citizenship, gender, time (3 x 3 x 2 x 5) [int32] + pop: country, gender, time (3 x 2 x 5) [int32] + pop_benelux: country, gender, time (3 x 2 x 5) [int32] """ if name is None: name = 'demography' diff --git a/larray/inout/csv.py b/larray/inout/csv.py index 440771afe..87318f15a 100644 --- a/larray/inout/csv.py +++ b/larray/inout/csv.py @@ -78,8 +78,8 @@ def read_csv(filepath_or_buffer, nb_axes=None, index_col=None, sep=',', headerse country,gender\time,2013,2014,2015 Belgium,Male,5472856,5493792,5524068 Belgium,Female,5665118,5687048,5713206 - France,Male,31772665,31936596,32175328 - France,Female,33827685,34005671,34280951 + France,Male,31772665,32045129,32174258 + France,Female,33827685,34120851,34283895 Germany,Male,39380976,39556923,39835457 Germany,Female,41142770,41210540,41362080 @@ -93,8 +93,8 @@ def read_csv(filepath_or_buffer, nb_axes=None, index_col=None, sep=',', headerse country gender\time 
2013 2014 2015 Belgium Male 5472856 5493792 5524068 Belgium Female 5665118 5687048 5713206 - France Male 31772665 31936596 32175328 - France Female 33827685 34005671 34280951 + France Male 31772665 32045129 32174258 + France Female 33827685 34120851 34283895 Germany Male 39380976 39556923 39835457 Germany Female 41142770 41210540 41362080 @@ -108,7 +108,7 @@ def read_csv(filepath_or_buffer, nb_axes=None, index_col=None, sep=',', headerse country,gender\time,2013,2014,2015 Belgium,Male,5472856,5493792,5524068 Belgium,Female,5665118,5687048,5713206 - France,Female,33827685,34005671,34280951 + France,Female,33827685,34120851,34283895 Germany,Male,39380976,39556923,39835457 >>> # by default, cells associated with missing label combinations are filled with NaN. >>> # In that case, an int array is converted to a float array. @@ -117,7 +117,7 @@ def read_csv(filepath_or_buffer, nb_axes=None, index_col=None, sep=',', headerse Belgium Male 5472856.0 5493792.0 5524068.0 Belgium Female 5665118.0 5687048.0 5713206.0 France Male nan nan nan - France Female 33827685.0 34005671.0 34280951.0 + France Female 33827685.0 34120851.0 34283895.0 Germany Male 39380976.0 39556923.0 39835457.0 Germany Female nan nan nan >>> # using argument 'fill_value', you can choose which value to use to fill missing cells. 
@@ -126,7 +126,7 @@ def read_csv(filepath_or_buffer, nb_axes=None, index_col=None, sep=',', headerse Belgium Male 5472856 5493792 5524068 Belgium Female 5665118 5687048 5713206 France Male 0 0 0 - France Female 33827685 34005671 34280951 + France Female 33827685 34120851 34283895 Germany Male 39380976 39556923 39835457 Germany Female 0 0 0 @@ -140,8 +140,8 @@ def read_csv(filepath_or_buffer, nb_axes=None, index_col=None, sep=',', headerse country,gender,2013,2014,2015 Belgium,Male,5472856,5493792,5524068 Belgium,Female,5665118,5687048,5713206 - France,Male,31772665,31936596,32175328 - France,Female,33827685,34005671,34280951 + France,Male,31772665,32045129,32174258 + France,Female,33827685,34120851,34283895 Germany,Male,39380976,39556923,39835457 Germany,Female,41142770,41210540,41362080 >>> # read the array stored in the CSV file as is @@ -177,13 +177,13 @@ def read_csv(filepath_or_buffer, nb_axes=None, index_col=None, sep=',', headerse Belgium,2014,11180840 Belgium,2015,11237274 France,2013,65600350 - France,2014,65942267 - France,2015,66456279 + France,2014,66165980 + France,2015,66458153 >>> # to read arrays stored in 'narrow' format, you must pass wide=False to read_csv >>> read_csv(fname, wide=False) country\time 2013 2014 2015 Belgium 11137974 11180840 11237274 - France 65600350 65942267 66456279 + France 65600350 66165980 66458153 """ if not np.isnan(na): fill_value = na diff --git a/larray/inout/excel.py b/larray/inout/excel.py index d7d7cfff8..4e76bb1c6 100644 --- a/larray/inout/excel.py +++ b/larray/inout/excel.py @@ -84,8 +84,8 @@ def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=nan, country gender\time 2013 2014 2015 Belgium Male 5472856 5493792 5524068 Belgium Female 5665118 5687048 5713206 - France Male 31772665 31936596 32175328 - France Female 33827685 34005671 34280951 + France Male 31772665 32045129 32174258 + France Female 33827685 34120851 34283895 Germany Male 39380976 39556923 39835457 Germany Female 41142770 
41210540 41362080 @@ -109,7 +109,7 @@ def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=nan, country gender\time 2013 2014 2015 Belgium Male 5472856 5493792 5524068 Belgium Female 5665118 5687048 5713206 - France Female 33827685 34005671 34280951 + France Female 33827685 34120851 34283895 Germany Male 39380976 39556923 39835457 By default, cells associated with missing label combinations are filled with NaN. In that case, an int array @@ -120,7 +120,7 @@ def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=nan, Belgium Male 5472856.0 5493792.0 5524068.0 Belgium Female 5665118.0 5687048.0 5713206.0 France Male nan nan nan - France Female 33827685.0 34005671.0 34280951.0 + France Female 33827685.0 34120851.0 34283895.0 Germany Male 39380976.0 39556923.0 39835457.0 Germany Female nan nan nan @@ -131,7 +131,7 @@ def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=nan, Belgium Male 5472856 5493792 5524068 Belgium Female 5665118 5687048 5713206 France Male 0 0 0 - France Female 33827685 34005671 34280951 + France Female 33827685 34120851 34283895 Germany Male 39380976 39556923 39835457 Germany Female 0 0 0 @@ -142,8 +142,8 @@ def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=nan, country gender 2013 2014 2015 Belgium Male 5472856 5493792 5524068 Belgium Female 5665118 5687048 5713206 - France Male 31772665 31936596 32175328 - France Female 33827685 34005671 34280951 + France Male 31772665 32045129 32174258 + France Female 33827685 34120851 34283895 Germany Male 39380976 39556923 39835457 Germany Female 41142770 41210540 41362080 @@ -177,14 +177,14 @@ def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=nan, Belgium 2014 11180840 Belgium 2015 11237274 France 2013 65600350 - France 2014 65942267 - France 2015 66456279 + France 2014 66165980 + France 2015 66458153 >>> # to read arrays stored in 'narrow' format, you must pass wide=False to read_excel >>> 
read_excel(fname, 'pop_narrow_format', wide=False) country\time 2013 2014 2015 Belgium 11137974 11180840 11237274 - France 65600350 65942267 66456279 + France 65600350 66165980 66458153 Extract array from a given range (xlwings only) diff --git a/larray/inout/hdf.py b/larray/inout/hdf.py index 92bbc7516..25d0df0eb 100644 --- a/larray/inout/hdf.py +++ b/larray/inout/hdf.py @@ -57,8 +57,8 @@ def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, s country gender\time 2013 2014 2015 Belgium Male 5472856 5493792 5524068 Belgium Female 5665118 5687048 5713206 - France Male 31772665 31936596 32175328 - France Female 33827685 34005671 34280951 + France Male 31772665 32045129 32174258 + France Female 33827685 34120851 34283895 Germany Male 39380976 39556923 39835457 Germany Female 41142770 41210540 41362080 """ diff --git a/larray/inout/xw_reporting.py b/larray/inout/xw_reporting.py index 243c6048c..424cc124d 100644 --- a/larray/inout/xw_reporting.py +++ b/larray/inout/xw_reporting.py @@ -79,7 +79,7 @@ def template(self): Examples -------- - >>> demo = load_example_data('demo') + >>> demo = load_example_data('demography_eurostat') Passing the name of the template (only if a template directory has been set) @@ -245,7 +245,7 @@ def add_graph(self, data, title=None, template=None, width=None, height=None): Examples -------- - >>> demo = load_example_data('demo') + >>> demo = load_example_data('demography_eurostat') >>> report = ExcelReport(EXAMPLE_EXCEL_TEMPLATES_DIR) >>> sheet_be = report.new_sheet('Belgium') @@ -300,7 +300,7 @@ def add_graphs(self, array_per_title, axis_per_loop_variable, template=None, wid Examples -------- - >>> demo = load_example_data('demo') + >>> demo = load_example_data('demography_eurostat') >>> report = ExcelReport(EXAMPLE_EXCEL_TEMPLATES_DIR) >>> sheet_pop = report.new_sheet('Population') @@ -353,7 +353,7 @@ class AbstractExcelReport(AbstractReportItem): Examples -------- - >>> demo = load_example_data('demo') + >>> demo = 
load_example_data('demography_eurostat') >>> report = ExcelReport(EXAMPLE_EXCEL_TEMPLATES_DIR) Set a new destination sheet @@ -428,7 +428,7 @@ def new_sheet(self, sheet_name): Examples -------- - >>> demo = load_example_data('demo') + >>> demo = load_example_data('demography_eurostat') >>> report = ExcelReport(EXAMPLE_EXCEL_TEMPLATES_DIR) >>> # prepare new output sheet named 'Belgium' @@ -471,7 +471,7 @@ def to_excel(self, filepath, data_sheet_name='__data__', overwrite=True): Examples -------- - >>> demo = load_example_data('demo') + >>> demo = load_example_data('demography_eurostat') >>> report = ExcelReport(EXAMPLE_EXCEL_TEMPLATES_DIR) >>> report.template = 'Line_Marker' diff --git a/larray/tests/data/births_and_deaths.xlsx b/larray/tests/data/births_and_deaths.xlsx index 507815dc6..475156847 100644 Binary files a/larray/tests/data/births_and_deaths.xlsx and b/larray/tests/data/births_and_deaths.xlsx differ diff --git a/larray/tests/data/demography_eurostat.h5 b/larray/tests/data/demography_eurostat.h5 new file mode 100644 index 000000000..0ebe38fae Binary files /dev/null and b/larray/tests/data/demography_eurostat.h5 differ diff --git a/larray/tests/data/demography_eurostat.xlsx b/larray/tests/data/demography_eurostat.xlsx new file mode 100644 index 000000000..d7da56f1d Binary files /dev/null and b/larray/tests/data/demography_eurostat.xlsx differ diff --git a/larray/tests/data/demography_eurostat/__axes__.csv b/larray/tests/data/demography_eurostat/__axes__.csv new file mode 100644 index 000000000..b338005b0 --- /dev/null +++ b/larray/tests/data/demography_eurostat/__axes__.csv @@ -0,0 +1,6 @@ +country,country,citizenship,gender,time +Belgium,Belgium,Belgium,Male,2013 +France,Luxembourg,Luxembourg,Female,2014 +Germany,Netherlands,Netherlands,,2015 +,,,,2016 +,,,,2017 diff --git a/larray/tests/data/population_session/__groups__.csv b/larray/tests/data/demography_eurostat/__groups__.csv similarity index 71% rename from 
larray/tests/data/population_session/__groups__.csv rename to larray/tests/data/demography_eurostat/__groups__.csv index a25e717f9..8785c4f5c 100644 --- a/larray/tests/data/population_session/__groups__.csv +++ b/larray/tests/data/demography_eurostat/__groups__.csv @@ -1,3 +1,4 @@ even_years@time,odd_years@time 2014,2013 -,2015 +2016,2015 +,2017 diff --git a/larray/tests/data/demography_eurostat/__metadata__.csv b/larray/tests/data/demography_eurostat/__metadata__.csv new file mode 100644 index 000000000..67b888532 --- /dev/null +++ b/larray/tests/data/demography_eurostat/__metadata__.csv @@ -0,0 +1,3 @@ +metadata, +title,Demographic datasets for a small selection of countries in Europe +source,"demo_jpan, demo_fasec, demo_magec and migr_imm1ctz tables from Eurostat" diff --git a/larray/tests/data/demography_eurostat/births.csv b/larray/tests/data/demography_eurostat/births.csv new file mode 100644 index 000000000..94bf8ea3a --- /dev/null +++ b/larray/tests/data/demography_eurostat/births.csv @@ -0,0 +1,7 @@ +country,gender\time,2013,2014,2015,2016,2017 +Belgium,Male,64371,64173,62561,62428,61179 +Belgium,Female,61235,60841,59713,59468,58511 +France,Male,415762,418721,409145,401388,394058 +France,Female,396581,400607,390526,382937,375987 +Germany,Male,349820,366835,378478,405587,402517 +Germany,Female,332249,348092,359097,386554,382384 diff --git a/larray/tests/data/demography_eurostat/deaths.csv b/larray/tests/data/demography_eurostat/deaths.csv new file mode 100644 index 000000000..26471af7c --- /dev/null +++ b/larray/tests/data/demography_eurostat/deaths.csv @@ -0,0 +1,7 @@ +country,gender\time,2013,2014,2015,2016,2017 +Belgium,Male,53908,51579,53631,53326,53825 +Belgium,Female,55426,53176,56910,54771,55841 +France,Male,287410,282381,297028,297020,301020 +France,Female,281955,277054,296779,296985,305390 +Germany,Male,429645,422225,449512,448305,457761 +Germany,Female,464180,446131,475688,462597,474511 diff --git 
a/larray/tests/data/demography_eurostat/immigration.csv b/larray/tests/data/demography_eurostat/immigration.csv new file mode 100644 index 000000000..4dd711e10 --- /dev/null +++ b/larray/tests/data/demography_eurostat/immigration.csv @@ -0,0 +1,19 @@ +country,citizenship,gender\time,2013,2014,2015,2016,2017 +Belgium,Belgium,Male,8822,10512,11378,11055,11082 +Belgium,Belgium,Female,5727,6301,6486,6560,6454 +Belgium,Luxembourg,Male,102,117,105,130,110 +Belgium,Luxembourg,Female,117,123,114,108,118 +Belgium,Netherlands,Male,4185,4222,4183,4199,4138 +Belgium,Netherlands,Female,3737,3844,3942,3664,3632 +Luxembourg,Belgium,Male,896,937,880,762,781 +Luxembourg,Belgium,Female,574,655,622,558,575 +Luxembourg,Luxembourg,Male,694,722,660,740,650 +Luxembourg,Luxembourg,Female,607,586,535,591,549 +Luxembourg,Netherlands,Male,160,165,147,141,167 +Luxembourg,Netherlands,Female,92,97,85,94,119 +Netherlands,Belgium,Male,1063,1141,1113,1364,1493 +Netherlands,Belgium,Female,980,1071,1181,1340,1449 +Netherlands,Luxembourg,Male,23,43,59,70,83 +Netherlands,Luxembourg,Female,24,34,46,60,97 +Netherlands,Netherlands,Male,19374,20037,21119,22707,23750 +Netherlands,Netherlands,Female,16945,17411,18084,19815,20894 diff --git a/larray/tests/data/demography_eurostat/pop.csv b/larray/tests/data/demography_eurostat/pop.csv new file mode 100644 index 000000000..49dc24bf9 --- /dev/null +++ b/larray/tests/data/demography_eurostat/pop.csv @@ -0,0 +1,7 @@ +country,gender\time,2013,2014,2015,2016,2017 +Belgium,Male,5472856,5493792,5524068,5569264,5589272 +Belgium,Female,5665118,5687048,5713206,5741853,5762455 +France,Male,31772665,32045129,32174258,32247386,32318973 +France,Female,33827685,34120851,34283895,34391005,34485148 +Germany,Male,39380976,39556923,39835457,40514123,40697118 +Germany,Female,41142770,41210540,41362080,41661561,41824535 diff --git a/larray/tests/data/demography_eurostat/pop_benelux.csv b/larray/tests/data/demography_eurostat/pop_benelux.csv new file mode 100644 index 
000000000..3aca52147 --- /dev/null +++ b/larray/tests/data/demography_eurostat/pop_benelux.csv @@ -0,0 +1,7 @@ +country,gender\time,2013,2014,2015,2016,2017 +Belgium,Male,5472856,5493792,5524068,5569264,5589272 +Belgium,Female,5665118,5687048,5713206,5741853,5762455 +Luxembourg,Male,268412,275117,281972,289193,296641 +Luxembourg,Female,268627,274563,280986,287056,294026 +Netherlands,Male,8307339,8334385,8372858,8417135,8475102 +Netherlands,Female,8472236,8494904,8527868,8561985,8606405 diff --git a/larray/tests/data/examples.h5 b/larray/tests/data/examples.h5 index c2e1fe570..c020696fe 100644 Binary files a/larray/tests/data/examples.h5 and b/larray/tests/data/examples.h5 differ diff --git a/larray/tests/data/examples.xlsx b/larray/tests/data/examples.xlsx index 5a409c42c..cd6ef9161 100644 Binary files a/larray/tests/data/examples.xlsx and b/larray/tests/data/examples.xlsx differ diff --git a/larray/tests/data/examples/immigration.csv b/larray/tests/data/examples/immigration.csv new file mode 100644 index 000000000..eb4331e7d --- /dev/null +++ b/larray/tests/data/examples/immigration.csv @@ -0,0 +1,19 @@ +country,citizenship,gender\time,2013,2014,2015 +Belgium,Belgium,Male,8822,10512,11378 +Belgium,Belgium,Female,5727,6301,6486 +Belgium,Luxembourg,Male,102,117,105 +Belgium,Luxembourg,Female,117,123,114 +Belgium,Netherlands,Male,4185,4222,4183 +Belgium,Netherlands,Female,3737,3844,3942 +Luxembourg,Belgium,Male,896,937,880 +Luxembourg,Belgium,Female,574,655,622 +Luxembourg,Luxembourg,Male,694,722,660 +Luxembourg,Luxembourg,Female,607,586,535 +Luxembourg,Netherlands,Male,160,165,147 +Luxembourg,Netherlands,Female,92,97,85 +Netherlands,Belgium,Male,1063,1141,1113 +Netherlands,Belgium,Female,980,1071,1181 +Netherlands,Luxembourg,Male,23,43,59 +Netherlands,Luxembourg,Female,24,34,46 +Netherlands,Netherlands,Male,19374,20037,21119 +Netherlands,Netherlands,Female,16945,17411,18084 diff --git a/larray/tests/data/examples/pop.csv b/larray/tests/data/examples/pop.csv index 
4bc913d7d..adcf4e093 100644 --- a/larray/tests/data/examples/pop.csv +++ b/larray/tests/data/examples/pop.csv @@ -1,7 +1,7 @@ country,gender\time,2013,2014,2015 Belgium,Male,5472856,5493792,5524068 Belgium,Female,5665118,5687048,5713206 -France,Male,31772665,31936596,32175328 -France,Female,33827685,34005671,34280951 +France,Male,31772665,32045129,32174258 +France,Female,33827685,34120851,34283895 Germany,Male,39380976,39556923,39835457 Germany,Female,41142770,41210540,41362080 diff --git a/larray/tests/data/examples/pop_missing_axis_name.csv b/larray/tests/data/examples/pop_missing_axis_name.csv index ab80633d6..9b917b13d 100644 --- a/larray/tests/data/examples/pop_missing_axis_name.csv +++ b/larray/tests/data/examples/pop_missing_axis_name.csv @@ -1,7 +1,7 @@ country,gender,2013,2014,2015 Belgium,Male,5472856,5493792,5524068 Belgium,Female,5665118,5687048,5713206 -France,Male,31772665,31936596,32175328 -France,Female,33827685,34005671,34280951 +France,Male,31772665,32045129,32174258 +France,Female,33827685,34120851,34283895 Germany,Male,39380976,39556923,39835457 Germany,Female,41142770,41210540,41362080 diff --git a/larray/tests/data/examples/pop_missing_values.csv b/larray/tests/data/examples/pop_missing_values.csv index 4f9647ff0..1167018d0 100644 --- a/larray/tests/data/examples/pop_missing_values.csv +++ b/larray/tests/data/examples/pop_missing_values.csv @@ -1,5 +1,5 @@ country,gender\time,2013,2014,2015 Belgium,Male,5472856,5493792,5524068 Belgium,Female,5665118,5687048,5713206 -France,Female,33827685,34005671,34280951 +France,Female,33827685,34120851,34283895 Germany,Male,39380976,39556923,39835457 diff --git a/larray/tests/data/examples/pop_narrow_format.csv b/larray/tests/data/examples/pop_narrow_format.csv index 6c68bd758..ad804bf19 100644 --- a/larray/tests/data/examples/pop_narrow_format.csv +++ b/larray/tests/data/examples/pop_narrow_format.csv @@ -3,5 +3,5 @@ Belgium,2013,11137974 Belgium,2014,11180840 Belgium,2015,11237274 France,2013,65600350 
-France,2014,65942267 -France,2015,66456279 +France,2014,66165980 +France,2015,66458153 diff --git a/larray/tests/data/pop_only.xlsx b/larray/tests/data/pop_only.xlsx index ccaacacb1..153afd30a 100644 Binary files a/larray/tests/data/pop_only.xlsx and b/larray/tests/data/pop_only.xlsx differ diff --git a/larray/tests/data/population_session.h5 b/larray/tests/data/population_session.h5 deleted file mode 100644 index 24f3adf29..000000000 Binary files a/larray/tests/data/population_session.h5 and /dev/null differ diff --git a/larray/tests/data/population_session.xlsx b/larray/tests/data/population_session.xlsx deleted file mode 100644 index 00bd78f01..000000000 Binary files a/larray/tests/data/population_session.xlsx and /dev/null differ diff --git a/larray/tests/data/population_session/__axes__.csv b/larray/tests/data/population_session/__axes__.csv deleted file mode 100644 index a05472fe4..000000000 --- a/larray/tests/data/population_session/__axes__.csv +++ /dev/null @@ -1,4 +0,0 @@ -country,gender,time -Belgium,Male,2013 -France,Female,2014 -Germany,,2015 diff --git a/larray/tests/data/population_session/births.csv b/larray/tests/data/population_session/births.csv deleted file mode 100644 index e00b6f566..000000000 --- a/larray/tests/data/population_session/births.csv +++ /dev/null @@ -1,7 +0,0 @@ -country,gender\time,2013,2014,2015 -Belgium,Male,64371,64173,62561 -Belgium,Female,61235,60841,59713 -France,Male,415762,418721,409145 -France,Female,396581,400607,390526 -Germany,Male,349820,366835,378478 -Germany,Female,332249,348092,359097 diff --git a/larray/tests/data/population_session/deaths.csv b/larray/tests/data/population_session/deaths.csv deleted file mode 100644 index 5bc6ece8c..000000000 --- a/larray/tests/data/population_session/deaths.csv +++ /dev/null @@ -1,7 +0,0 @@ -country,gender\time,2013,2014,2015 -Belgium,Male,53908,51579,53631 -Belgium,Female,55426,53176,56910 -France,Male,287410,282381,297028 -France,Female,281955,277054,296779 
-Germany,Male,429645,422225,449512 -Germany,Female,464180,446131,475688 diff --git a/larray/tests/data/population_session/pop.csv b/larray/tests/data/population_session/pop.csv deleted file mode 100644 index 4bc913d7d..000000000 --- a/larray/tests/data/population_session/pop.csv +++ /dev/null @@ -1,7 +0,0 @@ -country,gender\time,2013,2014,2015 -Belgium,Male,5472856,5493792,5524068 -Belgium,Female,5665118,5687048,5713206 -France,Male,31772665,31936596,32175328 -France,Female,33827685,34005671,34280951 -Germany,Male,39380976,39556923,39835457 -Germany,Female,41142770,41210540,41362080 diff --git a/larray/tests/data/test.xlsx b/larray/tests/data/test.xlsx index 61b849a26..6d9b4e2d9 100644 Binary files a/larray/tests/data/test.xlsx and b/larray/tests/data/test.xlsx differ diff --git a/larray/tests/data/test2d_classic.csv b/larray/tests/data/test2d_classic.csv index a5b63f948..28c1d639e 100644 --- a/larray/tests/data/test2d_classic.csv +++ b/larray/tests/data/test2d_classic.csv @@ -1,4 +1,4 @@ a,b0,b1,b2 a0,0,1,2 a1,3,4,5 -a2,6,7,8 \ No newline at end of file +a2,6,7,8 diff --git a/larray/tests/data/test2d_classic_narrow.csv b/larray/tests/data/test2d_classic_narrow.csv new file mode 100644 index 000000000..1e320d3e1 --- /dev/null +++ b/larray/tests/data/test2d_classic_narrow.csv @@ -0,0 +1,10 @@ +a,b,value +a0,b0,0 +a0,b1,1 +a0,b2,2 +a1,b0,3 +a1,b1,4 +a1,b2,5 +a2,b0,6 +a2,b1,7 +a2,b2,8 diff --git a/larray/tests/data/test_narrow.xlsx b/larray/tests/data/test_narrow.xlsx index 8422ace79..800f35d1b 100644 Binary files a/larray/tests/data/test_narrow.xlsx and b/larray/tests/data/test_narrow.xlsx differ diff --git a/larray/tests/data/testint_labels_narrow.csv b/larray/tests/data/testint_labels_narrow.csv new file mode 100644 index 000000000..94cf238e2 --- /dev/null +++ b/larray/tests/data/testint_labels_narrow.csv @@ -0,0 +1,28 @@ +a,b,c,value +0,0,0,0 +0,0,1,1 +0,0,2,2 +0,1,0,3 +0,1,1,4 +0,1,2,5 +0,2,0,6 +0,2,1,7 +0,2,2,8 +1,0,0,9 +1,0,1,10 +1,0,2,11 +1,1,0,12 +1,1,1,13 
+1,1,2,14 +1,2,0,15 +1,2,1,16 +1,2,2,17 +2,0,0,18 +2,0,1,19 +2,0,2,20 +2,1,0,21 +2,1,1,22 +2,1,2,23 +2,2,0,24 +2,2,1,25 +2,2,2,26 diff --git a/larray/tests/data/testunsorted.csv b/larray/tests/data/testunsorted.csv new file mode 100644 index 000000000..2d2e37a22 --- /dev/null +++ b/larray/tests/data/testunsorted.csv @@ -0,0 +1,7 @@ +a,b\c,c2,c1,c0 +3,b1,0,1,2 +3,b0,3,4,5 +2,b1,6,7,8 +2,b0,9,10,11 +1,b1,12,13,14 +1,b0,15,16,17 diff --git a/larray/tests/generate_data.py b/larray/tests/generate_data.py new file mode 100644 index 000000000..9459dcac4 --- /dev/null +++ b/larray/tests/generate_data.py @@ -0,0 +1,194 @@ +import os + +from larray import ndtest, open_excel, Session, X + + +DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') + + +def generate_tests_files(): + tests = {'1d': 3, + '2d': "a=1..3; b=b0,b1", + '2d_classic': "a=a0..a2;b=b0..b2", + '3d': "a=1..3; b=b0,b1; c=c0..c2", + 'int_labels': "a=0..2; b=0..2; c=0..2", + 'missing_values': "a=1..3; b=b0,b1; c=c0..c2", + 'unsorted': "a=3..1; b=b1,b0; c=c2..c0", + 'position': "a=1..3; b=b0,b1; c=c0..c2"} + + wb = open_excel(os.path.join(DATA_DIR, 'test.xlsx'), overwrite_file=True) + wb_narrow = open_excel(os.path.join(DATA_DIR, 'test_narrow.xlsx'), overwrite_file=True) + + for name, dim in tests.items(): + arr = ndtest(dim) + if name == '2d_classic': + df = arr.to_frame(fold_last_axis_name=False) + # wide format + df.to_csv(os.path.join(DATA_DIR, 'test{}.csv'.format(name)), sep=',', na_rep='') + wb[name] = '' + wb[name]['A1'].options().value = df + # narrow format + df = arr.to_series(name='value') + df.to_csv(os.path.join(DATA_DIR, 'test{}_narrow.csv'.format(name)), sep=',', na_rep='', header=True) + wb_narrow[name] = '' + wb_narrow[name]['A1'].options().value = df + elif name == 'missing_values': + df = arr.to_frame(fold_last_axis_name=True) + # wide format + df = df.drop([(2, 'b0'), (3, 'b1')]) + df.to_csv(os.path.join(DATA_DIR, 'test{}.csv'.format(name)), sep=',', na_rep='') + wb[name] = '' + 
wb[name]['A1'].options().value = df + # narrow format + df = arr.to_series(name='value') + df = df.drop([(2, 'b0'), (2, 'b1', 'c1'), (3, 'b1')]) + df.to_csv(os.path.join(DATA_DIR, 'test{}_narrow.csv'.format(name)), sep=',', na_rep='', header=True) + wb_narrow[name] = '' + wb_narrow[name]['A1'].options().value = df + elif name == 'position': + # wide format + wb[name] = '' + wb[name]['D3'] = arr.dump() + # narrow format + wb_narrow[name] = '' + wb_narrow[name]['D3'] = arr.dump(wide=False) + else: + # wide format + arr.to_csv(os.path.join(DATA_DIR, 'test{}.csv'.format(name))) + wb[name] = arr.dump() + # narrow format + arr.to_csv(os.path.join(DATA_DIR, 'test{}_narrow.csv'.format(name)), wide=False) + wb_narrow[name] = arr.dump(wide=False) + + wb.save() + wb.close() + wb_narrow.save() + wb_narrow.close() + + +def generate_example_files(csv=True, excel=True, hdf5=True): + from larray_eurostat import eurostat_get + + def prepare_eurostat_data(dataset_name, countries): + arr = eurostat_get(dataset_name)[X.unit['NR'], X.age['TOTAL'], X.sex['M,F']] + arr = arr[X.time[::-1]][2013:2017] + arr = arr.rename('sex', 'gender') + arr = arr.set_labels(gender='Male,Female') + arr = arr.rename('geo', 'country') + country_codes = list(countries.keys()) + country_names = list(countries.values()) + if dataset_name == 'migr_imm1ctz': + # example of an array with ambiguous axes + arr = arr['COMPLET', X.citizen[country_codes], X.country[country_codes]].astype(int) + arr = arr.rename('citizen', 'citizenship') + arr = arr.set_labels('citizenship', country_names) + arr = arr.set_labels('country', country_names) + arr = arr.transpose('country', 'citizenship', 'gender', 'time') + else: + arr = arr[country_codes].astype(int) + arr = arr.set_labels('country', country_names) + arr = arr.transpose('country', 'gender', 'time') + return arr + + countries = {'BE': 'Belgium', 'FR': 'France', 'DE': 'Germany'} + benelux = {'BE': 'Belgium', 'LU': 'Luxembourg', 'NL': 'Netherlands'} + + # Arrays + pop = 
prepare_eurostat_data('demo_pjan', countries) + pop.meta.title = 'Population on 1 January by age and sex' + pop.meta.source = 'table demo_pjan from Eurostat' + # ---- + pop_benelux = prepare_eurostat_data('demo_pjan', benelux) + pop_benelux.meta.title = 'Population on 1 January by age and sex (Benelux)' + pop_benelux.meta.source = 'table demo_pjan from Eurostat' + # ---- + births = prepare_eurostat_data('demo_fasec', countries) + births.meta.title = "Live births by mother's age and newborn's sex" + births.meta.source = 'table demo_fasec from Eurostat' + # ---- + deaths = prepare_eurostat_data('demo_magec', countries) + deaths.meta.title = 'Deaths by age and sex' + deaths.meta.source = 'table demo_magec from Eurostat' + # ---- + immigration = prepare_eurostat_data('migr_imm1ctz', benelux) + immigration.meta.title = 'Immigration by age group, sex and citizenship' + immigration.meta.source = 'table migr_imm1ctz from Eurostat' + + # Groups + even_years = pop.time[2014::2] >> 'even_years' + odd_years = pop.time[2013::2] >> 'odd_years' + + # Session + ses = Session({'country': pop.country, 'country_benelux': immigration.country, + 'citizenship': immigration.citizenship, + 'gender': pop.gender, 'time': pop.time, + 'even_years': even_years, 'odd_years': odd_years, + 'pop': pop, 'pop_benelux': pop_benelux, 'births': births, 'deaths': deaths, + 'immigration': immigration}) + ses.meta.title = 'Demographic datasets for a small selection of countries in Europe' + ses.meta.source = 'demo_jpan, demo_fasec, demo_magec and migr_imm1ctz tables from Eurostat' + + # EUROSTAT DATASET + + if csv: + ses.save(os.path.join(DATA_DIR, 'demography_eurostat')) + if excel: + ses.save(os.path.join(DATA_DIR, 'demography_eurostat.xlsx')) + if hdf5: + ses.save(os.path.join(DATA_DIR, 'demography_eurostat.h5')) + + # EXAMPLE FILES + + years = pop.time[2013:2015] + pop = pop[years] + pop_narrow = pop['Belgium,France'].sum('gender') + births = births[years] + deaths = deaths[years] + immigration = 
immigration[years] + + # Dataframes (for testing missing axis/values) + df_missing_axis_name = pop.to_frame(fold_last_axis_name=False) + df_missing_values = pop.to_frame(fold_last_axis_name=True) + df_missing_values.drop([('France', 'Male'), ('Germany', 'Female')], inplace=True) + + if csv: + examples_dir = os.path.join(DATA_DIR, 'examples') + pop.to_csv(os.path.join(examples_dir, 'pop.csv')) + births.to_csv(os.path.join(examples_dir, 'births.csv')) + deaths.to_csv(os.path.join(examples_dir, 'deaths.csv')) + immigration.to_csv(os.path.join(examples_dir, 'immigration.csv')) + df_missing_axis_name.to_csv(os.path.join(examples_dir, 'pop_missing_axis_name.csv'), sep=',', na_rep='') + df_missing_values.to_csv(os.path.join(examples_dir, 'pop_missing_values.csv'), sep=',', na_rep='') + pop_narrow.to_csv(os.path.join(examples_dir, 'pop_narrow_format.csv'), wide=False) + + if excel: + with open_excel(os.path.join(DATA_DIR, 'examples.xlsx'), overwrite_file=True) as wb: + wb['pop'] = pop.dump() + wb['births'] = births.dump() + wb['deaths'] = deaths.dump() + wb['immigration'] = immigration.dump() + wb['pop_births_deaths'] = pop.dump() + wb['pop_births_deaths']['A9'] = births.dump() + wb['pop_births_deaths']['A17'] = deaths.dump() + wb['pop_missing_axis_name'] = '' + wb['pop_missing_axis_name']['A1'].options().value = df_missing_axis_name + wb['pop_missing_values'] = '' + wb['pop_missing_values']['A1'].options().value = df_missing_values + # wb['pop_narrow_format'] = pop_narrow.dump(wide=False) + wb.save() + pop_narrow.to_excel(os.path.join(DATA_DIR, 'examples.xlsx'), 'pop_narrow_format', wide=False) + Session({'country': pop.country, 'gender': pop.gender, 'time': pop.time, + 'pop': pop}).save(os.path.join(DATA_DIR, 'pop_only.xlsx')) + Session({'births': births, 'deaths': deaths}).save(os.path.join(DATA_DIR, 'births_and_deaths.xlsx')) + + if hdf5: + examples_h5_file = os.path.join(DATA_DIR, 'examples.h5') + pop.to_hdf(examples_h5_file, 'pop') + births.to_hdf(examples_h5_file, 
'births') + deaths.to_hdf(examples_h5_file, 'deaths') + immigration.to_hdf(examples_h5_file, 'immigration') + + +if __name__ == '__main__': + # generate_tests_files() + generate_example_files() diff --git a/larray/tests/test_array.py b/larray/tests/test_array.py index 0615f987e..10871e498 100644 --- a/larray/tests/test_array.py +++ b/larray/tests/test_array.py @@ -2484,15 +2484,15 @@ def test_percent(array): reg = array.sum(age, sex, regions) percent = reg.percent() - assert_array_equal(percent, reg * 100 / reg.sum(geo, lipro)) + assert_array_equal(percent, (reg * 100.0 / reg.sum(geo, lipro))) assert percent.shape == (3, 15) percent = reg.percent(geo) - assert_array_equal(percent, reg * 100 / reg.sum(geo)) + assert_array_equal(percent, (reg * 100.0 / reg.sum(geo))) assert percent.shape == (3, 15) percent = reg.percent(geo, lipro) - assert_array_equal(percent, reg * 100 / reg.sum(geo, lipro)) + assert_array_equal(percent, (reg * 100.0 / reg.sum(geo, lipro))) assert percent.shape == (3, 15) assert round(abs(percent.sum() - 100.0), 7) == 0 diff --git a/larray/tests/test_excel.py b/larray/tests/test_excel.py index 06b991931..c5ec86221 100644 --- a/larray/tests/test_excel.py +++ b/larray/tests/test_excel.py @@ -338,7 +338,7 @@ def test_excel_report_titles(): @needs_xlwings def test_excel_report_arrays(): excel_report = ExcelReport(EXAMPLE_EXCEL_TEMPLATES_DIR) - demo = load_example_data('demo') + demo = load_example_data('demography_eurostat') pop = demo.pop pop_be = pop['Belgium'] pop_be_nan = pop_be.astype(float)