diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 69205d6bebb65..1ed6f2c172787 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -44,7 +44,6 @@ ValueLabelTypeMismatch, ) from pandas.util._decorators import ( - Appender, doc, set_module, ) @@ -127,10 +126,6 @@ Return StataReader object for iterations, returns chunks with given number of lines.""" -_iterator_params = """\ -iterator : bool, default False - Return StataReader object.""" - _reader_notes = """\ Notes ----- @@ -139,80 +134,6 @@ file is associated to an incomplete set of value labels that only label a strict subset of the values.""" -_read_stata_doc = f""" -Read Stata file into DataFrame. - -Parameters ----------- -filepath_or_buffer : str, path object or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, and file. For file URLs, a host is - expected. A local file could be: ``file://localhost/path/to/table.dta``. - - If you want to pass in a path object, pandas accepts any ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, - such as a file handle (e.g. via builtin ``open`` function) - or ``StringIO``. -{_statafile_processing_params1} -{_statafile_processing_params2} -{_chunksize_params} -{_iterator_params} -{_shared_docs["decompression_options"] % "filepath_or_buffer"} -{_shared_docs["storage_options"]} - -Returns -------- -DataFrame, pandas.api.typing.StataReader - If iterator or chunksize, returns StataReader, else DataFrame. - -See Also --------- -io.stata.StataReader : Low-level reader for Stata data files. -DataFrame.to_stata: Export Stata data files. - -{_reader_notes} - -Examples --------- - -Creating a dummy stata for this example - ->>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', 'parrot'], -... 
'speed': [350, 18, 361, 15]}}) # doctest: +SKIP ->>> df.to_stata('animals.dta') # doctest: +SKIP - -Read a Stata dta file: - ->>> df = pd.read_stata('animals.dta') # doctest: +SKIP - -Read a Stata dta file in 10,000 line chunks: - ->>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8") # doctest: +SKIP ->>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP ->>> df.to_stata('filename.dta') # doctest: +SKIP - ->>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP ->>> for chunk in itr: -... # Operate on a single chunk, e.g., chunk.mean() -... pass # doctest: +SKIP -""" - -_read_method_doc = f"""\ -Reads observations from Stata file, converting them into a dataframe - -Parameters ----------- -nrows : int - Number of lines to read from data file, if None read whole file. -{_statafile_processing_params1} -{_statafile_processing_params2} - -Returns -------- -DataFrame -""" - _stata_reader_doc = f"""\ Class for reading Stata dta files. @@ -1677,7 +1598,6 @@ def get_chunk(self, size: int | None = None) -> DataFrame: size = self._chunksize return self.read(nrows=size) - @Appender(_read_method_doc) def read( self, nrows: int | None = None, @@ -1689,6 +1609,38 @@ def read( columns: Sequence[str] | None = None, order_categoricals: bool | None = None, ) -> DataFrame: + """ + Reads observations from Stata file, converting them into a dataframe + + Parameters + ---------- + nrows : int + Number of lines to read from data file, if None read whole file. + convert_dates : bool, default True + Convert date variables to DataFrame time values. + convert_categoricals : bool, default True + Read value labels and convert columns to Categorical/Factor variables. + index_col : str, optional + Column to set as index. + convert_missing : bool, default False + Flag indicating whether to convert missing values to their Stata + representations. If False, missing values are replaced with nan. 
+ If True, columns containing missing values are returned with + object data types and missing values are represented by + StataMissingValue objects. + preserve_dtypes : bool, default True + Preserve Stata datatypes. If False, numeric data are upcast to pandas + default types for foreign data (float64 or int64). + columns : list or None + Columns to retain. Columns will be returned in the given order. None + returns all columns. + order_categoricals : bool, default True + Flag indicating whether converted categorical data are ordered. + + Returns + ------- + DataFrame + """ self._ensure_open() # Handle options @@ -2135,7 +2087,6 @@ def value_labels(self) -> dict[str, dict[int, str]]: @set_module("pandas") -@Appender(_read_stata_doc) def read_stata( filepath_or_buffer: FilePath | ReadBuffer[bytes], *, @@ -2151,6 +2102,122 @@ def read_stata( compression: CompressionOptions = "infer", storage_options: StorageOptions | None = None, ) -> DataFrame | StataReader: + """ + Read Stata file into DataFrame. + + Parameters + ---------- + filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: ``file://localhost/path/to/table.dta``. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handle (e.g. via builtin ``open`` function) + or ``StringIO``. + convert_dates : bool, default True + Convert date variables to DataFrame time values. + convert_categoricals : bool, default True + Read value labels and convert columns to Categorical/Factor variables. + index_col : str, optional + Column to set as index. + convert_missing : bool, default False + Flag indicating whether to convert missing values to their Stata + representations. If False, missing values are replaced with nan. 
+ If True, columns containing missing values are returned with + object data types and missing values are represented by + StataMissingValue objects. + preserve_dtypes : bool, default True + Preserve Stata datatypes. If False, numeric data are upcast to pandas + default types for foreign data (float64 or int64). + columns : list or None + Columns to retain. Columns will be returned in the given order. None + returns all columns. + order_categoricals : bool, default True + Flag indicating whether converted categorical data are ordered. + chunksize : int, default None + Return StataReader object for iterations, returns chunks with + given number of lines. + iterator : bool, default False + Return StataReader object. + compression : str or dict, default 'infer' + For on-the-fly decompression of on-disk data. If 'infer' and + 'filepath_or_buffer' is path-like, then detect compression from the + following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar', + '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression). + If using 'zip' or 'tar', the ZIP file must contain only one + data file to be read in. Set to ``None`` for no decompression. + Can also be a dict with key ``'method'`` set to one of + {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and + other key-value pairs are forwarded to + ``zipfile.ZipFile``, ``gzip.GzipFile``, + ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or + ``tarfile.TarFile``, respectively. + As an example, the following could be passed for Zstandard decompression using a + custom compression dictionary: + ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. + + .. versionadded:: 1.5.0 + Added support for `.tar` files. + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc. For HTTP(S) URLs the key-value pairs + are forwarded to ``urllib.request.Request`` as header options. 
For other + URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are + forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more + details, and for more examples on storage options refer `here + <https://pandas.pydata.org/docs/user_guide/io.html? + highlight=storage_options#reading-writing-remote-files>`_. + + Returns + ------- + DataFrame, pandas.api.typing.StataReader + If iterator or chunksize, returns StataReader, else DataFrame. + + See Also + -------- + io.stata.StataReader : Low-level reader for Stata data files. + DataFrame.to_stata: Export Stata data files. + + Notes + ----- + Categorical variables read through an iterator may not have the same + categories and dtype. This occurs when a variable stored in a DTA + file is associated to an incomplete set of value labels that only + label a strict subset of the values. + + Examples + -------- + + Creating a dummy stata for this example + + >>> df = pd.DataFrame( + ... { + ... "animal": ["falcon", "parrot", "falcon", "parrot"], + ... "speed": [350, 18, 361, 15], + ... } + ... ) # doctest: +SKIP + >>> df.to_stata("animals.dta") # doctest: +SKIP + + Read a Stata dta file: + + >>> df = pd.read_stata("animals.dta") # doctest: +SKIP + + Read a Stata dta file in 10,000 line chunks: + + >>> values = np.random.randint( + ... 0, 10, size=(20_000, 1), dtype="uint8" + ... ) # doctest: +SKIP + >>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP + >>> df.to_stata("filename.dta") # doctest: +SKIP + + >>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP + ... for chunk in itr: + ... # Operate on a single chunk, e.g., chunk.mean() + ... pass # doctest: +SKIP + """ reader = StataReader( filepath_or_buffer, convert_dates=convert_dates,