From f797864f98b5680a6d7ee29d2ddfbb9ab0366b99 Mon Sep 17 00:00:00 2001
From: JuanCarlos3
Date: Sun, 5 Oct 2025 20:56:38 -0700
Subject: [PATCH 1/4] Re-added contents of removed @Appender decorators as static docstrings.

---
 pandas/io/stata.py | 142 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 142 insertions(+)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 979f00973958b..7a34e0794c46e 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1688,6 +1688,38 @@ def read(
         columns: Sequence[str] | None = None,
         order_categoricals: bool | None = None,
     ) -> DataFrame:
+        """
+        Reads observations from Stata file, converting them into a dataframe
+
+        Parameters
+        ----------
+        nrows : int
+            Number of lines to read from data file, if None read whole file.
+        convert_dates : bool, default True
+            Convert date variables to DataFrame time values.
+        convert_categoricals : bool, default True
+            Read value labels and convert columns to Categorical/Factor variables.
+        index_col : str, optional
+            Column to set as index.
+        convert_missing : bool, default False
+            Flag indicating whether to convert missing values to their Stata
+            representations. If False, missing values are replaced with nan.
+            If True, columns containing missing values are returned with
+            object data types and missing values are represented by
+            StataMissingValue objects.
+        preserve_dtypes : bool, default True
+            Preserve Stata datatypes. If False, numeric data are upcast to pandas
+            default types for foreign data (float64 or int64).
+        columns : list or None
+            Columns to retain. Columns will be returned in the given order. None
+            returns all columns.
+        order_categoricals : bool, default True
+            Flag indicating whether converted categorical data are ordered.
+
+        Returns
+        -------
+        DataFrame
+        """
         self._ensure_open()
 
         # Handle options
@@ -2149,6 +2181,116 @@ def read_stata(
     compression: CompressionOptions = "infer",
     storage_options: StorageOptions | None = None,
 ) -> DataFrame | StataReader:
+    """
+    Read Stata file into DataFrame.
+
+    Parameters
+    ----------
+    filepath_or_buffer : str, path object or file-like object
+        Any valid string path is acceptable. The string could be a URL. Valid
+        URL schemes include http, ftp, s3, and file. For file URLs, a host is
+        expected. A local file could be: ``file://localhost/path/to/table.dta``.
+
+        If you want to pass in a path object, pandas accepts any ``os.PathLike``.
+
+        By file-like object, we refer to objects with a ``read()`` method,
+        such as a file handle (e.g. via builtin ``open`` function)
+        or ``StringIO``.
+    convert_dates : bool, default True
+        Convert date variables to DataFrame time values.
+    convert_categoricals : bool, default True
+        Read value labels and convert columns to Categorical/Factor variables.
+    index_col : str, optional
+        Column to set as index.
+    convert_missing : bool, default False
+        Flag indicating whether to convert missing values to their Stata
+        representations. If False, missing values are replaced with nan.
+        If True, columns containing missing values are returned with
+        object data types and missing values are represented by
+        StataMissingValue objects.
+    preserve_dtypes : bool, default True
+        Preserve Stata datatypes. If False, numeric data are upcast to pandas
+        default types for foreign data (float64 or int64).
+    columns : list or None
+        Columns to retain. Columns will be returned in the given order. None
+        returns all columns.
+    order_categoricals : bool, default True
+        Flag indicating whether converted categorical data are ordered.
+    chunksize : int, default None
+        Return StataReader object for iterations, returns chunks with
+        given number of lines.
+    iterator : bool, default False
+        Return StataReader object.
+    compression : str or dict, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer' and 'filepath_or_buffer' is
+        path-like, then detect compression from the following extensions: '.gz',
+        '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+        (otherwise no compression).
+        If using 'zip' or 'tar', the ZIP file must contain only one data file to be read in.
+        Set to ``None`` for no decompression.
+        Can also be a dict with key ``'method'`` set
+        to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
+        other key-value pairs are forwarded to
+        ``zipfile.ZipFile``, ``gzip.GzipFile``,
+        ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
+        ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for Zstandard decompression using a
+        custom compression dictionary:
+        ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
+
+        .. versionadded:: 1.5.0
+            Added support for `.tar` files.
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+        are forwarded to ``urllib.request.Request`` as header options. For other
+        URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+        forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+        details, and for more examples on storage options refer `here `_.
+
+    Returns
+    -------
+    DataFrame, pandas.api.typing.StataReader
+        If iterator or chunksize, returns StataReader, else DataFrame.
+
+    See Also
+    --------
+    io.stata.StataReader : Low-level reader for Stata data files.
+    DataFrame.to_stata: Export Stata data files.
+
+    Notes
+    -----
+    Categorical variables read through an iterator may not have the same
+    categories and dtype. This occurs when a variable stored in a DTA
+    file is associated to an incomplete set of value labels that only
+    label a strict subset of the values.
+
+    Examples
+    --------
+
+    Creating a dummy stata for this example
+
+    >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon', 'parrot'],
+    ...                   'speed': [350, 18, 361, 15]}) # doctest: +SKIP
+    >>> df.to_stata('animals.dta') # doctest: +SKIP
+
+    Read a Stata dta file:
+
+    >>> df = pd.read_stata('animals.dta') # doctest: +SKIP
+
+    Read a Stata dta file in 10,000 line chunks:
+
+    >>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8") # doctest: +SKIP
+    >>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP
+    >>> df.to_stata('filename.dta') # doctest: +SKIP
+
+    >>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP
+    >>> for chunk in itr:
+    ...     # Operate on a single chunk, e.g., chunk.mean()
+    ...     pass # doctest: +SKIP
+    """
     reader = StataReader(
         filepath_or_buffer,
         convert_dates=convert_dates,

From 42d066a163e52cd565c5da97b214f2635b9f1b03 Mon Sep 17 00:00:00 2001
From: JuanCarlos3
Date: Sun, 5 Oct 2025 21:15:47 -0700
Subject: [PATCH 2/4] Removed usages of Appender:

- Removed import of @appender
- Removed all usages of @appender decorators
---
 pandas/io/stata.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 7a34e0794c46e..218327cba7f3c 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -44,7 +44,6 @@
     ValueLabelTypeMismatch,
 )
 from pandas.util._decorators import (
-    Appender,
     doc,
 )
 from pandas.util._exceptions import find_stack_level
@@ -1676,7 +1675,6 @@ def get_chunk(self, size: int | None = None) -> DataFrame:
             size = self._chunksize
         return self.read(nrows=size)
 
-    @Appender(_read_method_doc)
     def read(
         self,
         nrows: int | None = None,
@@ -2165,7 +2163,6 @@ def value_labels(self) -> dict[str, dict[int, str]]:
         return self._value_label_dict
 
 
-@Appender(_read_stata_doc)
 def read_stata(
     filepath_or_buffer: FilePath | ReadBuffer[bytes],
     *,

From 57c530f4e6f7978001ca2b4cbaa3e9f4b72421a8 Mon Sep 17 00:00:00 2001
From: JuanCarlos3
Date: Tue, 7 Oct 2025 16:57:34 -0700
Subject: [PATCH 3/4] Fix issue with line length violations caught by Ruff.

---
 pandas/io/stata.py | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 4f09153ef5503..c9aa4410a5f42 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -2221,14 +2221,14 @@ def read_stata(
     iterator : bool, default False
         Return StataReader object.
     compression : str or dict, default 'infer'
-        For on-the-fly decompression of on-disk data. If 'infer' and 'filepath_or_buffer' is
-        path-like, then detect compression from the following extensions: '.gz',
-        '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
-        (otherwise no compression).
-        If using 'zip' or 'tar', the ZIP file must contain only one data file to be read in.
-        Set to ``None`` for no decompression.
-        Can also be a dict with key ``'method'`` set
-        to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
+        For on-the-fly decompression of on-disk data. If 'infer' and
+        'filepath_or_buffer' is path-like, then detect compression from the
+        following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
+        '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
+        If using 'zip' or 'tar', the ZIP file must contain only one
+        data file to be read in. Set to ``None`` for no decompression.
+        Can also be a dict with key ``'method'`` set to one of
+        {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
         other key-value pairs are forwarded to
         ``zipfile.ZipFile``, ``gzip.GzipFile``,
         ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
@@ -2271,19 +2271,25 @@ def read_stata(
     Creating a dummy stata for this example
 
-    >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon', 'parrot'],
-    ...                   'speed': [350, 18, 361, 15]}) # doctest: +SKIP
-    >>> df.to_stata('animals.dta') # doctest: +SKIP
+    >>> df = pd.DataFrame(
+    ...     {
+    ...         "animal": ["falcon", "parrot", "falcon", "parrot"],
+    ...         "speed": [350, 18, 361, 15],
+    ...     }
+    ... ) # doctest: +SKIP
+    >>> df.to_stata("animals.dta") # doctest: +SKIP
 
     Read a Stata dta file:
 
-    >>> df = pd.read_stata('animals.dta') # doctest: +SKIP
+    >>> df = pd.read_stata("animals.dta") # doctest: +SKIP
 
     Read a Stata dta file in 10,000 line chunks:
 
-    >>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8") # doctest: +SKIP
+    >>> values = np.random.randint(
+    ...     0, 10, size=(20_000, 1), dtype="uint8"
+    ... ) # doctest: +SKIP
     >>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP
-    >>> df.to_stata('filename.dta') # doctest: +SKIP
+    >>> df.to_stata("filename.dta") # doctest: +SKIP
 
     >>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP
     >>> for chunk in itr:
     ...     # Operate on a single chunk, e.g., chunk.mean()

From bb7d34a29b6a0365fe2f761dc719a66a914ac343 Mon Sep 17 00:00:00 2001
From: JuanCarlos3
Date: Wed, 8 Oct 2025 18:29:55 -0700
Subject: [PATCH 4/4] Remove dead code left after @Appender removal in stata.py

---
 pandas/io/stata.py | 78 ----------------------------------------------
 1 file changed, 78 deletions(-)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index c9aa4410a5f42..1ed6f2c172787 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -126,10 +126,6 @@
     Return StataReader object for iterations, returns chunks with
     given number of lines."""
 
-_iterator_params = """\
-iterator : bool, default False
-    Return StataReader object."""
-
 _reader_notes = """\
 Notes
 -----
@@ -138,80 +134,6 @@
 file is associated to an incomplete set of value labels that only
 label a strict subset of the values."""
 
-_read_stata_doc = f"""
-Read Stata file into DataFrame.
-
-Parameters
-----------
-filepath_or_buffer : str, path object or file-like object
-    Any valid string path is acceptable. The string could be a URL. Valid
-    URL schemes include http, ftp, s3, and file. For file URLs, a host is
-    expected. A local file could be: ``file://localhost/path/to/table.dta``.
-
-    If you want to pass in a path object, pandas accepts any ``os.PathLike``.
-
-    By file-like object, we refer to objects with a ``read()`` method,
-    such as a file handle (e.g. via builtin ``open`` function)
-    or ``StringIO``.
-{_statafile_processing_params1}
-{_statafile_processing_params2}
-{_chunksize_params}
-{_iterator_params}
-{_shared_docs["decompression_options"] % "filepath_or_buffer"}
-{_shared_docs["storage_options"]}
-
-Returns
--------
-DataFrame, pandas.api.typing.StataReader
-    If iterator or chunksize, returns StataReader, else DataFrame.
-
-See Also
---------
-io.stata.StataReader : Low-level reader for Stata data files.
-DataFrame.to_stata: Export Stata data files.
-
-{_reader_notes}
-
-Examples
---------
-
-Creating a dummy stata for this example
-
->>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', 'parrot'],
-...                   'speed': [350, 18, 361, 15]}}) # doctest: +SKIP
->>> df.to_stata('animals.dta') # doctest: +SKIP
-
-Read a Stata dta file:
-
->>> df = pd.read_stata('animals.dta') # doctest: +SKIP
-
-Read a Stata dta file in 10,000 line chunks:
-
->>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8") # doctest: +SKIP
->>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP
->>> df.to_stata('filename.dta') # doctest: +SKIP
-
->>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP
->>> for chunk in itr:
-...     # Operate on a single chunk, e.g., chunk.mean()
-...     pass # doctest: +SKIP
-"""
-
-_read_method_doc = f"""\
-Reads observations from Stata file, converting them into a dataframe
-
-Parameters
-----------
-nrows : int
-    Number of lines to read from data file, if None read whole file.
-{_statafile_processing_params1}
-{_statafile_processing_params2}
-
-Returns
--------
-DataFrame
-"""
-
 _stata_reader_doc = f"""\
 Class for reading Stata dta files.
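
Reviewer note: for anyone unfamiliar with the pattern this series removes, the sketch below shows how an Appender-style decorator composes a function's docstring from a shared fragment at import time. This is a simplified illustration under stated assumptions, not the pandas implementation; the names SHARED_NOTES, append_doc, and read_example are made up for the example.

# Simplified sketch of the docstring-appending pattern removed above.
# Not the pandas implementation; all names here are illustrative only.
SHARED_NOTES = """
Notes
-----
Categorical variables read through an iterator may not have the same
categories and dtype.
"""


def append_doc(addendum: str):
    """Return a decorator that appends ``addendum`` to a function's __doc__."""

    def decorator(func):
        func.__doc__ = (func.__doc__ or "") + addendum
        return func

    return decorator


@append_doc(SHARED_NOTES)
def read_example(path):
    """Read a hypothetical data file into a DataFrame."""
    ...


# help(read_example) now shows the inline summary followed by the shared
# fragment appended at decoration time.
print(read_example.__doc__)

With the decorators gone, the same text lives directly in the read() and read_stata() docstrings, which is why the final patch can delete the module-level _read_stata_doc and _read_method_doc fragments as dead code.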