@@ -1688,6 +1688,38 @@ def read(
1688
1688
columns : Sequence [str ] | None = None ,
1689
1689
order_categoricals : bool | None = None ,
1690
1690
) -> DataFrame :
1691
+ """
1692
+ Read observations from a Stata file, converting them into a DataFrame.
1693
+
1694
+ Parameters
1695
+ ----------
1696
+ nrows : int or None
1697
+ Number of lines to read from data file, if None read whole file.
1698
+ convert_dates : bool, default True
1699
+ Convert date variables to DataFrame time values.
1700
+ convert_categoricals : bool, default True
1701
+ Read value labels and convert columns to Categorical/Factor variables.
1702
+ index_col : str, optional
1703
+ Column to set as index.
1704
+ convert_missing : bool, default False
1705
+ Flag indicating whether to convert missing values to their Stata
1706
+ representations. If False, missing values are replaced with nan.
1707
+ If True, columns containing missing values are returned with
1708
+ object data types and missing values are represented by
1709
+ StataMissingValue objects.
1710
+ preserve_dtypes : bool, default True
1711
+ Preserve Stata datatypes. If False, numeric data are upcast to pandas
1712
+ default types for foreign data (float64 or int64).
1713
+ columns : list or None
1714
+ Columns to retain. Columns will be returned in the given order. None
1715
+ returns all columns.
1716
+ order_categoricals : bool, default True
1717
+ Flag indicating whether converted categorical data are ordered.
1718
+
1719
+ Returns
1720
+ -------
1721
+ DataFrame
1722
+ """
1691
1723
self ._ensure_open ()
1692
1724
1693
1725
# Handle options
@@ -2149,6 +2181,116 @@ def read_stata(
2149
2181
compression : CompressionOptions = "infer" ,
2150
2182
storage_options : StorageOptions | None = None ,
2151
2183
) -> DataFrame | StataReader :
2184
+ """
2185
+ Read Stata file into DataFrame.
2186
+
2187
+ Parameters
2188
+ ----------
2189
+ filepath_or_buffer : str, path object or file-like object
2190
+ Any valid string path is acceptable. The string could be a URL. Valid
2191
+ URL schemes include http, ftp, s3, and file. For file URLs, a host is
2192
+ expected. A local file could be: ``file://localhost/path/to/table.dta``.
2193
+
2194
+ If you want to pass in a path object, pandas accepts any ``os.PathLike``.
2195
+
2196
+ By file-like object, we refer to objects with a ``read()`` method,
2197
+ such as a file handle (e.g. via builtin ``open`` function)
2198
+ or ``StringIO``.
2199
+ convert_dates : bool, default True
2200
+ Convert date variables to DataFrame time values.
2201
+ convert_categoricals : bool, default True
2202
+ Read value labels and convert columns to Categorical/Factor variables.
2203
+ index_col : str, optional
2204
+ Column to set as index.
2205
+ convert_missing : bool, default False
2206
+ Flag indicating whether to convert missing values to their Stata
2207
+ representations. If False, missing values are replaced with nan.
2208
+ If True, columns containing missing values are returned with
2209
+ object data types and missing values are represented by
2210
+ StataMissingValue objects.
2211
+ preserve_dtypes : bool, default True
2212
+ Preserve Stata datatypes. If False, numeric data are upcast to pandas
2213
+ default types for foreign data (float64 or int64).
2214
+ columns : list or None
2215
+ Columns to retain. Columns will be returned in the given order. None
2216
+ returns all columns.
2217
+ order_categoricals : bool, default True
2218
+ Flag indicating whether converted categorical data are ordered.
2219
+ chunksize : int, default None
2220
+ Return StataReader object for iterations, returns chunks with
2221
+ given number of lines.
2222
+ iterator : bool, default False
2223
+ Return StataReader object.
2224
+ compression : str or dict, default 'infer'
2225
+ For on-the-fly decompression of on-disk data. If 'infer' and 'filepath_or_buffer' is
2226
+ path-like, then detect compression from the following extensions: '.gz',
2227
+ '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
2228
+ (otherwise no compression).
2229
+ If using 'zip' or 'tar', the archive must contain only one data file to be read in.
2230
+ Set to ``None`` for no decompression.
2231
+ Can also be a dict with key ``'method'`` set
2232
+ to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
2233
+ other key-value pairs are forwarded to
2234
+ ``zipfile.ZipFile``, ``gzip.GzipFile``,
2235
+ ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
2236
+ ``tarfile.TarFile``, respectively.
2237
+ As an example, the following could be passed for Zstandard decompression using a
2238
+ custom compression dictionary:
2239
+ ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
2240
+
2241
+ .. versionadded:: 1.5.0
2242
+ Added support for `.tar` files.
2243
+ storage_options : dict, optional
2244
+ Extra options that make sense for a particular storage connection, e.g.
2245
+ host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
2246
+ are forwarded to ``urllib.request.Request`` as header options. For other
2247
+ URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
2248
+ forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
2249
+ details, and for more examples on storage options refer `here
2250
+ <https://pandas.pydata.org/docs/user_guide/io.html?
2251
+ highlight=storage_options#reading-writing-remote-files>`_.
2252
+
2253
+ Returns
2254
+ -------
2255
+ DataFrame, pandas.api.typing.StataReader
2256
+ If iterator or chunksize, returns StataReader, else DataFrame.
2257
+
2258
+ See Also
2259
+ --------
2260
+ io.stata.StataReader : Low-level reader for Stata data files.
2261
+ DataFrame.to_stata : Export Stata data files.
2262
+
2263
+ Notes
2264
+ -----
2265
+ Categorical variables read through an iterator may not have the same
2266
+ categories and dtype. This occurs when a variable stored in a DTA
2267
+ file is associated to an incomplete set of value labels that only
2268
+ label a strict subset of the values.
2269
+
2270
+ Examples
2271
+ --------
2272
+
2273
+ Create a dummy Stata file for this example:
2274
+
2275
+ >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon', 'parrot'],
2276
+ ... 'speed': [350, 18, 361, 15]}) # doctest: +SKIP
2277
+ >>> df.to_stata('animals.dta') # doctest: +SKIP
2278
+
2279
+ Read a Stata dta file:
2280
+
2281
+ >>> df = pd.read_stata('animals.dta') # doctest: +SKIP
2282
+
2283
+ Read a Stata dta file in 10,000 line chunks:
2284
+
2285
+ >>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8") # doctest: +SKIP
2286
+ >>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP
2287
+ >>> df.to_stata('filename.dta') # doctest: +SKIP
2288
+
2289
+ >>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP
2290
+ ... for chunk in itr:
2291
+ ... # Operate on a single chunk, e.g., chunk.mean()
2292
+ ... pass # doctest: +SKIP
2293
+ """
2152
2294
reader = StataReader (
2153
2295
filepath_or_buffer ,
2154
2296
convert_dates = convert_dates ,
0 commit comments