44
44
ValueLabelTypeMismatch ,
45
45
)
46
46
from pandas .util ._decorators import (
47
- Appender ,
48
47
doc ,
49
48
set_module ,
50
49
)
127
126
Return StataReader object for iterations, returns chunks with
128
127
given number of lines."""
129
128
130
- _iterator_params = """\
131
- iterator : bool, default False
132
- Return StataReader object."""
133
-
134
129
_reader_notes = """\
135
130
Notes
136
131
-----
139
134
file is associated to an incomplete set of value labels that only
140
135
label a strict subset of the values."""
141
136
142
- _read_stata_doc = f"""
143
- Read Stata file into DataFrame.
144
-
145
- Parameters
146
- ----------
147
- filepath_or_buffer : str, path object or file-like object
148
- Any valid string path is acceptable. The string could be a URL. Valid
149
- URL schemes include http, ftp, s3, and file. For file URLs, a host is
150
- expected. A local file could be: ``file://localhost/path/to/table.dta``.
151
-
152
- If you want to pass in a path object, pandas accepts any ``os.PathLike``.
153
-
154
- By file-like object, we refer to objects with a ``read()`` method,
155
- such as a file handle (e.g. via builtin ``open`` function)
156
- or ``StringIO``.
157
- { _statafile_processing_params1 }
158
- { _statafile_processing_params2 }
159
- { _chunksize_params }
160
- { _iterator_params }
161
- { _shared_docs ["decompression_options" ] % "filepath_or_buffer" }
162
- { _shared_docs ["storage_options" ]}
163
-
164
- Returns
165
- -------
166
- DataFrame, pandas.api.typing.StataReader
167
- If iterator or chunksize, returns StataReader, else DataFrame.
168
-
169
- See Also
170
- --------
171
- io.stata.StataReader : Low-level reader for Stata data files.
172
- DataFrame.to_stata: Export Stata data files.
173
-
174
- { _reader_notes }
175
-
176
- Examples
177
- --------
178
-
179
- Creating a dummy stata for this example
180
-
181
- >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', 'parrot'],
182
- ... 'speed': [350, 18, 361, 15]}}) # doctest: +SKIP
183
- >>> df.to_stata('animals.dta') # doctest: +SKIP
184
-
185
- Read a Stata dta file:
186
-
187
- >>> df = pd.read_stata('animals.dta') # doctest: +SKIP
188
-
189
- Read a Stata dta file in 10,000 line chunks:
190
-
191
- >>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8") # doctest: +SKIP
192
- >>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP
193
- >>> df.to_stata('filename.dta') # doctest: +SKIP
194
-
195
- >>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP
196
- >>> for chunk in itr:
197
- ... # Operate on a single chunk, e.g., chunk.mean()
198
- ... pass # doctest: +SKIP
199
- """
200
-
201
- _read_method_doc = f"""\
202
- Reads observations from Stata file, converting them into a dataframe
203
-
204
- Parameters
205
- ----------
206
- nrows : int
207
- Number of lines to read from data file, if None read whole file.
208
- { _statafile_processing_params1 }
209
- { _statafile_processing_params2 }
210
-
211
- Returns
212
- -------
213
- DataFrame
214
- """
215
-
216
137
_stata_reader_doc = f"""\
217
138
Class for reading Stata dta files.
218
139
@@ -1677,7 +1598,6 @@ def get_chunk(self, size: int | None = None) -> DataFrame:
1677
1598
size = self ._chunksize
1678
1599
return self .read (nrows = size )
1679
1600
1680
- @Appender (_read_method_doc )
1681
1601
def read (
1682
1602
self ,
1683
1603
nrows : int | None = None ,
@@ -1689,6 +1609,38 @@ def read(
1689
1609
columns : Sequence [str ] | None = None ,
1690
1610
order_categoricals : bool | None = None ,
1691
1611
) -> DataFrame :
1612
+ """
1613
+ Read observations from a Stata file, converting them into a DataFrame.
1614
+
1615
+ Parameters
1616
+ ----------
1617
+ nrows : int, optional
1618
+ Number of lines to read from the data file. If None, read the whole file.
1619
+ convert_dates : bool, default True
1620
+ Convert date variables to DataFrame time values.
1621
+ convert_categoricals : bool, default True
1622
+ Read value labels and convert columns to Categorical/Factor variables.
1623
+ index_col : str, optional
1624
+ Column to set as index.
1625
+ convert_missing : bool, default False
1626
+ Flag indicating whether to convert missing values to their Stata
1627
+ representations. If False, missing values are replaced with nan.
1628
+ If True, columns containing missing values are returned with
1629
+ object data types and missing values are represented by
1630
+ StataMissingValue objects.
1631
+ preserve_dtypes : bool, default True
1632
+ Preserve Stata datatypes. If False, numeric data are upcast to pandas
1633
+ default types for foreign data (float64 or int64).
1634
+ columns : list or None
1635
+ Columns to retain. Columns will be returned in the given order. None
1636
+ returns all columns.
1637
+ order_categoricals : bool, default True
1638
+ Flag indicating whether converted categorical data are ordered.
1639
+
1640
+ Returns
1641
+ -------
1642
+ DataFrame
1643
+ """
1692
1644
self ._ensure_open ()
1693
1645
1694
1646
# Handle options
@@ -2135,7 +2087,6 @@ def value_labels(self) -> dict[str, dict[int, str]]:
2135
2087
2136
2088
2137
2089
@set_module ("pandas" )
2138
- @Appender (_read_stata_doc )
2139
2090
def read_stata (
2140
2091
filepath_or_buffer : FilePath | ReadBuffer [bytes ],
2141
2092
* ,
@@ -2151,6 +2102,122 @@ def read_stata(
2151
2102
compression : CompressionOptions = "infer" ,
2152
2103
storage_options : StorageOptions | None = None ,
2153
2104
) -> DataFrame | StataReader :
2105
+ """
2106
+ Read Stata file into DataFrame.
2107
+
2108
+ Parameters
2109
+ ----------
2110
+ filepath_or_buffer : str, path object or file-like object
2111
+ Any valid string path is acceptable. The string could be a URL. Valid
2112
+ URL schemes include http, ftp, s3, and file. For file URLs, a host is
2113
+ expected. A local file could be: ``file://localhost/path/to/table.dta``.
2114
+
2115
+ If you want to pass in a path object, pandas accepts any ``os.PathLike``.
2116
+
2117
+ By file-like object, we refer to objects with a ``read()`` method,
2118
+ such as a file handle (e.g. via builtin ``open`` function)
2119
+ or ``StringIO``.
2120
+ convert_dates : bool, default True
2121
+ Convert date variables to DataFrame time values.
2122
+ convert_categoricals : bool, default True
2123
+ Read value labels and convert columns to Categorical/Factor variables.
2124
+ index_col : str, optional
2125
+ Column to set as index.
2126
+ convert_missing : bool, default False
2127
+ Flag indicating whether to convert missing values to their Stata
2128
+ representations. If False, missing values are replaced with nan.
2129
+ If True, columns containing missing values are returned with
2130
+ object data types and missing values are represented by
2131
+ StataMissingValue objects.
2132
+ preserve_dtypes : bool, default True
2133
+ Preserve Stata datatypes. If False, numeric data are upcast to pandas
2134
+ default types for foreign data (float64 or int64).
2135
+ columns : list or None
2136
+ Columns to retain. Columns will be returned in the given order. None
2137
+ returns all columns.
2138
+ order_categoricals : bool, default True
2139
+ Flag indicating whether converted categorical data are ordered.
2140
+ chunksize : int, default None
2141
+ Return StataReader object for iterations, returns chunks with
2142
+ given number of lines.
2143
+ iterator : bool, default False
2144
+ Return StataReader object.
2145
+ compression : str or dict, default 'infer'
2146
+ For on-the-fly decompression of on-disk data. If 'infer' and
2147
+ 'filepath_or_buffer' is path-like, then detect compression from the
2148
+ following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
2149
+ '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
2150
+ If using 'zip' or 'tar', the archive must contain only one
2151
+ data file to be read in. Set to ``None`` for no decompression.
2152
+ Can also be a dict with key ``'method'`` set to one of
2153
+ {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
2154
+ other key-value pairs are forwarded to
2155
+ ``zipfile.ZipFile``, ``gzip.GzipFile``,
2156
+ ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
2157
+ ``tarfile.TarFile``, respectively.
2158
+ As an example, the following could be passed for Zstandard decompression using a
2159
+ custom compression dictionary:
2160
+ ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
2161
+
2162
+ .. versionadded:: 1.5.0
2163
+ Added support for `.tar` files.
2164
+ storage_options : dict, optional
2165
+ Extra options that make sense for a particular storage connection, e.g.
2166
+ host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
2167
+ are forwarded to ``urllib.request.Request`` as header options. For other
2168
+ URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
2169
+ forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
2170
+ details, and for more examples on storage options refer `here
2171
+ <https://pandas.pydata.org/docs/user_guide/io.html?
2172
+ highlight=storage_options#reading-writing-remote-files>`_.
2173
+
2174
+ Returns
2175
+ -------
2176
+ DataFrame, pandas.api.typing.StataReader
2177
+ If iterator or chunksize, returns StataReader, else DataFrame.
2178
+
2179
+ See Also
2180
+ --------
2181
+ io.stata.StataReader : Low-level reader for Stata data files.
2182
+ DataFrame.to_stata : Export Stata data files.
2183
+
2184
+ Notes
2185
+ -----
2186
+ Categorical variables read through an iterator may not have the same
2187
+ categories and dtype. This occurs when a variable stored in a DTA
2188
+ file is associated to an incomplete set of value labels that only
2189
+ label a strict subset of the values.
2190
+
2191
+ Examples
2192
+ --------
2193
+
2194
+ Create a dummy Stata file for this example:
2195
+
2196
+ >>> df = pd.DataFrame(
2197
+ ... {
2198
+ ... "animal": ["falcon", "parrot", "falcon", "parrot"],
2199
+ ... "speed": [350, 18, 361, 15],
2200
+ ... }
2201
+ ... ) # doctest: +SKIP
2202
+ >>> df.to_stata("animals.dta") # doctest: +SKIP
2203
+
2204
+ Read a Stata dta file:
2205
+
2206
+ >>> df = pd.read_stata("animals.dta") # doctest: +SKIP
2207
+
2208
+ Read a Stata dta file in 10,000 line chunks:
2209
+
2210
+ >>> values = np.random.randint(
2211
+ ... 0, 10, size=(20_000, 1), dtype="uint8"
2212
+ ... ) # doctest: +SKIP
2213
+ >>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP
2214
+ >>> df.to_stata("filename.dta") # doctest: +SKIP
2215
+
2216
+ >>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP
2217
+ ... for chunk in itr:
2218
+ ... # Operate on a single chunk, e.g., chunk.mean()
2219
+ ... pass # doctest: +SKIP
2220
+ """
2154
2221
reader = StataReader (
2155
2222
filepath_or_buffer ,
2156
2223
convert_dates = convert_dates ,
0 commit comments