Skip to content

Commit bfb740a

Browse files
authored
Replace appender in stata (#62621)
1 parent bbcf86f commit bfb740a

File tree

1 file changed

+148
-81
lines changed

1 file changed

+148
-81
lines changed

pandas/io/stata.py

Lines changed: 148 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@
4444
ValueLabelTypeMismatch,
4545
)
4646
from pandas.util._decorators import (
47-
Appender,
4847
doc,
4948
set_module,
5049
)
@@ -127,10 +126,6 @@
127126
Return StataReader object for iterations, returns chunks with
128127
given number of lines."""
129128

130-
_iterator_params = """\
131-
iterator : bool, default False
132-
Return StataReader object."""
133-
134129
_reader_notes = """\
135130
Notes
136131
-----
@@ -139,80 +134,6 @@
139134
file is associated to an incomplete set of value labels that only
140135
label a strict subset of the values."""
141136

142-
_read_stata_doc = f"""
143-
Read Stata file into DataFrame.
144-
145-
Parameters
146-
----------
147-
filepath_or_buffer : str, path object or file-like object
148-
Any valid string path is acceptable. The string could be a URL. Valid
149-
URL schemes include http, ftp, s3, and file. For file URLs, a host is
150-
expected. A local file could be: ``file://localhost/path/to/table.dta``.
151-
152-
If you want to pass in a path object, pandas accepts any ``os.PathLike``.
153-
154-
By file-like object, we refer to objects with a ``read()`` method,
155-
such as a file handle (e.g. via builtin ``open`` function)
156-
or ``StringIO``.
157-
{_statafile_processing_params1}
158-
{_statafile_processing_params2}
159-
{_chunksize_params}
160-
{_iterator_params}
161-
{_shared_docs["decompression_options"] % "filepath_or_buffer"}
162-
{_shared_docs["storage_options"]}
163-
164-
Returns
165-
-------
166-
DataFrame, pandas.api.typing.StataReader
167-
If iterator or chunksize, returns StataReader, else DataFrame.
168-
169-
See Also
170-
--------
171-
io.stata.StataReader : Low-level reader for Stata data files.
172-
DataFrame.to_stata: Export Stata data files.
173-
174-
{_reader_notes}
175-
176-
Examples
177-
--------
178-
179-
Creating a dummy stata for this example
180-
181-
>>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', 'parrot'],
182-
... 'speed': [350, 18, 361, 15]}}) # doctest: +SKIP
183-
>>> df.to_stata('animals.dta') # doctest: +SKIP
184-
185-
Read a Stata dta file:
186-
187-
>>> df = pd.read_stata('animals.dta') # doctest: +SKIP
188-
189-
Read a Stata dta file in 10,000 line chunks:
190-
191-
>>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8") # doctest: +SKIP
192-
>>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP
193-
>>> df.to_stata('filename.dta') # doctest: +SKIP
194-
195-
>>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP
196-
>>> for chunk in itr:
197-
... # Operate on a single chunk, e.g., chunk.mean()
198-
... pass # doctest: +SKIP
199-
"""
200-
201-
_read_method_doc = f"""\
202-
Reads observations from Stata file, converting them into a dataframe
203-
204-
Parameters
205-
----------
206-
nrows : int
207-
Number of lines to read from data file, if None read whole file.
208-
{_statafile_processing_params1}
209-
{_statafile_processing_params2}
210-
211-
Returns
212-
-------
213-
DataFrame
214-
"""
215-
216137
_stata_reader_doc = f"""\
217138
Class for reading Stata dta files.
218139
@@ -1677,7 +1598,6 @@ def get_chunk(self, size: int | None = None) -> DataFrame:
16771598
size = self._chunksize
16781599
return self.read(nrows=size)
16791600

1680-
@Appender(_read_method_doc)
16811601
def read(
16821602
self,
16831603
nrows: int | None = None,
@@ -1689,6 +1609,38 @@ def read(
16891609
columns: Sequence[str] | None = None,
16901610
order_categoricals: bool | None = None,
16911611
) -> DataFrame:
1612+
"""
1613+
Read observations from Stata file, converting them into a DataFrame.
1614+
1615+
Parameters
1616+
----------
1617+
nrows : int, optional
1618+
Number of lines to read from data file, if None read whole file.
1619+
convert_dates : bool, default True
1620+
Convert date variables to DataFrame time values.
1621+
convert_categoricals : bool, default True
1622+
Read value labels and convert columns to Categorical/Factor variables.
1623+
index_col : str, optional
1624+
Column to set as index.
1625+
convert_missing : bool, default False
1626+
Flag indicating whether to convert missing values to their Stata
1627+
representations. If False, missing values are replaced with nan.
1628+
If True, columns containing missing values are returned with
1629+
object data types and missing values are represented by
1630+
StataMissingValue objects.
1631+
preserve_dtypes : bool, default True
1632+
Preserve Stata datatypes. If False, numeric data are upcast to pandas
1633+
default types for foreign data (float64 or int64).
1634+
columns : list or None
1635+
Columns to retain. Columns will be returned in the given order. None
1636+
returns all columns.
1637+
order_categoricals : bool, default True
1638+
Flag indicating whether converted categorical data are ordered.
1639+
1640+
Returns
1641+
-------
1642+
DataFrame
1643+
"""
16921644
self._ensure_open()
16931645

16941646
# Handle options
@@ -2135,7 +2087,6 @@ def value_labels(self) -> dict[str, dict[int, str]]:
21352087

21362088

21372089
@set_module("pandas")
2138-
@Appender(_read_stata_doc)
21392090
def read_stata(
21402091
filepath_or_buffer: FilePath | ReadBuffer[bytes],
21412092
*,
@@ -2151,6 +2102,122 @@ def read_stata(
21512102
compression: CompressionOptions = "infer",
21522103
storage_options: StorageOptions | None = None,
21532104
) -> DataFrame | StataReader:
2105+
"""
2106+
Read Stata file into DataFrame.
2107+
2108+
Parameters
2109+
----------
2110+
filepath_or_buffer : str, path object or file-like object
2111+
Any valid string path is acceptable. The string could be a URL. Valid
2112+
URL schemes include http, ftp, s3, and file. For file URLs, a host is
2113+
expected. A local file could be: ``file://localhost/path/to/table.dta``.
2114+
2115+
If you want to pass in a path object, pandas accepts any ``os.PathLike``.
2116+
2117+
By file-like object, we refer to objects with a ``read()`` method,
2118+
such as a file handle (e.g. via builtin ``open`` function)
2119+
or ``StringIO``.
2120+
convert_dates : bool, default True
2121+
Convert date variables to DataFrame time values.
2122+
convert_categoricals : bool, default True
2123+
Read value labels and convert columns to Categorical/Factor variables.
2124+
index_col : str, optional
2125+
Column to set as index.
2126+
convert_missing : bool, default False
2127+
Flag indicating whether to convert missing values to their Stata
2128+
representations. If False, missing values are replaced with nan.
2129+
If True, columns containing missing values are returned with
2130+
object data types and missing values are represented by
2131+
StataMissingValue objects.
2132+
preserve_dtypes : bool, default True
2133+
Preserve Stata datatypes. If False, numeric data are upcast to pandas
2134+
default types for foreign data (float64 or int64).
2135+
columns : list or None
2136+
Columns to retain. Columns will be returned in the given order. None
2137+
returns all columns.
2138+
order_categoricals : bool, default True
2139+
Flag indicating whether converted categorical data are ordered.
2140+
chunksize : int, default None
2141+
Return StataReader object for iterations, returns chunks with
2142+
given number of lines.
2143+
iterator : bool, default False
2144+
Return StataReader object.
2145+
compression : str or dict, default 'infer'
2146+
For on-the-fly decompression of on-disk data. If 'infer' and
2147+
'filepath_or_buffer' is path-like, then detect compression from the
2148+
following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
2149+
'.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
2150+
If using 'zip' or 'tar', the archive must contain only one
2151+
data file to be read in. Set to ``None`` for no decompression.
2152+
Can also be a dict with key ``'method'`` set to one of
2153+
{``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
2154+
other key-value pairs are forwarded to
2155+
``zipfile.ZipFile``, ``gzip.GzipFile``,
2156+
``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
2157+
``tarfile.TarFile``, respectively.
2158+
As an example, the following could be passed for Zstandard decompression using a
2159+
custom compression dictionary:
2160+
``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
2161+
2162+
.. versionadded:: 1.5.0
2163+
Added support for `.tar` files.
2164+
storage_options : dict, optional
2165+
Extra options that make sense for a particular storage connection, e.g.
2166+
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
2167+
are forwarded to ``urllib.request.Request`` as header options. For other
2168+
URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
2169+
forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
2170+
details, and for more examples on storage options refer `here
2171+
<https://pandas.pydata.org/docs/user_guide/io.html?
2172+
highlight=storage_options#reading-writing-remote-files>`_.
2173+
2174+
Returns
2175+
-------
2176+
DataFrame, pandas.api.typing.StataReader
2177+
If iterator or chunksize, returns StataReader, else DataFrame.
2178+
2179+
See Also
2180+
--------
2181+
io.stata.StataReader : Low-level reader for Stata data files.
2182+
DataFrame.to_stata : Export Stata data files.
2183+
2184+
Notes
2185+
-----
2186+
Categorical variables read through an iterator may not have the same
2187+
categories and dtype. This occurs when a variable stored in a DTA
2188+
file is associated to an incomplete set of value labels that only
2189+
label a strict subset of the values.
2190+
2191+
Examples
2192+
--------
2193+
2194+
Creating a dummy Stata file for this example:
2195+
2196+
>>> df = pd.DataFrame(
2197+
... {
2198+
... "animal": ["falcon", "parrot", "falcon", "parrot"],
2199+
... "speed": [350, 18, 361, 15],
2200+
... }
2201+
... ) # doctest: +SKIP
2202+
>>> df.to_stata("animals.dta") # doctest: +SKIP
2203+
2204+
Read a Stata dta file:
2205+
2206+
>>> df = pd.read_stata("animals.dta") # doctest: +SKIP
2207+
2208+
Read a Stata dta file in 10,000 line chunks:
2209+
2210+
>>> values = np.random.randint(
2211+
... 0, 10, size=(20_000, 1), dtype="uint8"
2212+
... ) # doctest: +SKIP
2213+
>>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP
2214+
>>> df.to_stata("filename.dta") # doctest: +SKIP
2215+
2216+
>>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP
2217+
...     for chunk in itr:
2218+
... # Operate on a single chunk, e.g., chunk.mean()
2219+
... pass # doctest: +SKIP
2220+
"""
21542221
reader = StataReader(
21552222
filepath_or_buffer,
21562223
convert_dates=convert_dates,

0 commit comments

Comments
 (0)