implemented read_stata and LArray.to_stata

gdementen · gdementen · commit 2f2449d31ebc · 2019-06-26T15:58:07.000+02:00
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -625,6 +625,7 @@ Read
    read_hdf
    read_eurostat
    read_sas
+   read_stata
 
 Write
 -----
@@ -635,6 +636,7 @@ Write
    LArray.to_csv
    LArray.to_excel
    LArray.to_hdf
+   LArray.to_stata
 
 Excel
 =====
diff --git a/doc/source/changes/version_0_30.rst.inc b/doc/source/changes/version_0_30.rst.inc
@@ -101,6 +101,9 @@ Backward incompatible changes
 New features
 ^^^^^^^^^^^^
 
+* implemented :py:obj:`read_stata()` and :py:obj:`LArray.to_stata()` to read arrays from and write arrays to Stata .dta
+  files.
+
 * added :py:obj:`LArray.isin()` method to check whether each element of an array is contained in a list (or array) of
   values.
 
diff --git a/larray/__init__.py b/larray/__init__.py
@@ -28,6 +28,7 @@
 from larray.inout.excel import read_excel
 from larray.inout.hdf import read_hdf
 from larray.inout.sas import read_sas
+from larray.inout.stata import read_stata
 from larray.inout.xw_excel import open_excel, Workbook
 
 # just make sure handlers for .pkl and .pickle are initialized
@@ -74,7 +75,7 @@
     'real_if_close', 'interp', 'isnan', 'isinf', 'inverse',
     # inout
     'from_lists', 'from_string', 'from_frame', 'from_series', 'read_csv', 'read_tsv',
-    'read_eurostat', 'read_excel', 'read_hdf', 'read_sas', 'open_excel', 'Workbook',
+    'read_eurostat', 'read_excel', 'read_hdf', 'read_sas', 'read_stata', 'open_excel', 'Workbook',
     # utils
     'get_options', 'set_options',
     # viewer
diff --git a/larray/core/array.py b/larray/core/array.py
@@ -6266,6 +6266,38 @@ def to_hdf(self, filepath, key):
             attrs.writer = 'LArray'
             self.meta.to_hdf(store, key)
 
+    def to_stata(self, filepath_or_buffer, **kwargs):
+        r"""
+        Writes array to a Stata .dta file.
+
+        Parameters
+        ----------
+        filepath_or_buffer : str or file-like object
+            Path to .dta file or a file handle.
+
+        See Also
+        --------
+        read_stata
+
+        Notes
+        -----
+        The round trip to Stata (LArray.to_stata followed by read_stata) loses the name of the "column" axis.
+
+        Examples
+        --------
+        >>> axes = [Axis(3, 'row'), Axis('column=country,sex')]    # doctest: +SKIP
+        >>> arr = LArray([['BE', 'F'],
+        ...               ['FR', 'M'],
+        ...               ['FR', 'F']], axes=axes)                 # doctest: +SKIP
+        >>> arr                                                    # doctest: +SKIP
+        row*\column  country  sex
+                  0       BE    F
+                  1       FR    M
+                  2       FR    F
+        >>> arr.to_stata('test.dta')                               # doctest: +SKIP
+        """
+        self.to_frame().to_stata(filepath_or_buffer, **kwargs)
+
     @deprecate_kwarg('sheet_name', 'sheet')
     def to_excel(self, filepath=None, sheet=None, position='A1', overwrite_file=False, clear_sheet=False,
                  header=True, transpose=False, wide=True, value_name='value', engine=None, *args, **kwargs):
diff --git a/larray/inout/stata.py b/larray/inout/stata.py
@@ -0,0 +1,53 @@
+from __future__ import absolute_import, print_function
+
+import pandas as pd
+
+from larray.inout.pandas import from_frame
+
+__all__ = ['read_stata']
+
+
+def read_stata(filepath_or_buffer, index_col=None, sort_rows=False, sort_columns=False, **kwargs):
+    r"""
+    Reads a Stata .dta file and returns an LArray with its contents.
+
+    Parameters
+    ----------
+    filepath_or_buffer : str or file-like object
+        Path to .dta file or a file handle.
+    index_col : str or None, optional
+        Name of column to set as index. Defaults to None.
+    sort_rows : bool, optional
+        Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting).
+        This only makes sense in combination with index_col. Defaults to False.
+    sort_columns : bool, optional
+        Whether or not to sort the columns alphabetically (sorting is more efficient than not sorting).
+        Defaults to False.
+
+    Returns
+    -------
+    LArray
+
+    See Also
+    --------
+    LArray.to_stata
+
+    Notes
+    -----
+    The round trip to Stata (LArray.to_stata followed by read_stata) loses the name of the "column" axis.
+
+    Examples
+    --------
+    >>> read_stata('test.dta')                   # doctest: +SKIP
+    {0}\{1}  row  country  sex
+          0    0       BE    F
+          1    1       FR    M
+          2    2       FR    F
+    >>> read_stata('test.dta', index_col='row')  # doctest: +SKIP
+    row\{1}  country  sex
+          0       BE    F
+          1       FR    M
+          2       FR    F
+    """
+    df = pd.read_stata(filepath_or_buffer, index_col=index_col, **kwargs)
+    return from_frame(df, sort_rows=sort_rows, sort_columns=sort_columns)