-
-
Notifications
You must be signed in to change notification settings - Fork 19.1k
Replace appender in stata #62621
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Replace appender in stata #62621
Changes from 4 commits
f797864
42d066a
7d66f64
57c530f
bb7d34a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -44,7 +44,6 @@ | |
ValueLabelTypeMismatch, | ||
) | ||
from pandas.util._decorators import ( | ||
Appender, | ||
doc, | ||
set_module, | ||
) | ||
|
@@ -1677,7 +1676,6 @@ def get_chunk(self, size: int | None = None) -> DataFrame: | |
size = self._chunksize | ||
return self.read(nrows=size) | ||
|
||
@Appender(_read_method_doc) | ||
def read( | ||
self, | ||
nrows: int | None = None, | ||
|
@@ -1689,6 +1687,38 @@ def read( | |
columns: Sequence[str] | None = None, | ||
order_categoricals: bool | None = None, | ||
) -> DataFrame: | ||
""" | ||
Reads observations from Stata file, converting them into a dataframe | ||
|
||
Parameters | ||
---------- | ||
nrows : int | ||
Number of lines to read from data file, if None read whole file. | ||
convert_dates : bool, default True | ||
Convert date variables to DataFrame time values. | ||
convert_categoricals : bool, default True | ||
Read value labels and convert columns to Categorical/Factor variables. | ||
index_col : str, optional | ||
Column to set as index. | ||
convert_missing : bool, default False | ||
Flag indicating whether to convert missing values to their Stata | ||
representations. If False, missing values are replaced with nan. | ||
If True, columns containing missing values are returned with | ||
object data types and missing values are represented by | ||
StataMissingValue objects. | ||
preserve_dtypes : bool, default True | ||
Preserve Stata datatypes. If False, numeric data are upcast to pandas | ||
default types for foreign data (float64 or int64). | ||
columns : list or None | ||
Columns to retain. Columns will be returned in the given order. None | ||
returns all columns. | ||
order_categoricals : bool, default True | ||
Flag indicating whether converted categorical data are ordered. | ||
|
||
Returns | ||
------- | ||
DataFrame | ||
""" | ||
self._ensure_open() | ||
|
||
# Handle options | ||
|
@@ -2135,7 +2165,6 @@ def value_labels(self) -> dict[str, dict[int, str]]: | |
|
||
|
||
@set_module("pandas") | ||
@Appender(_read_stata_doc) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can `_read_method_doc` (and any associated variables) be removed after this change? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hi Matthew, yes the `_read_method_doc` can be removed. Should I remove it in this PR? |
||
def read_stata( | ||
filepath_or_buffer: FilePath | ReadBuffer[bytes], | ||
*, | ||
|
@@ -2151,6 +2180,122 @@ def read_stata( | |
compression: CompressionOptions = "infer", | ||
storage_options: StorageOptions | None = None, | ||
) -> DataFrame | StataReader: | ||
""" | ||
Read Stata file into DataFrame. | ||
|
||
Parameters | ||
---------- | ||
filepath_or_buffer : str, path object or file-like object | ||
Any valid string path is acceptable. The string could be a URL. Valid | ||
URL schemes include http, ftp, s3, and file. For file URLs, a host is | ||
expected. A local file could be: ``file://localhost/path/to/table.dta``. | ||
|
||
If you want to pass in a path object, pandas accepts any ``os.PathLike``. | ||
|
||
By file-like object, we refer to objects with a ``read()`` method, | ||
such as a file handle (e.g. via builtin ``open`` function) | ||
or ``StringIO``. | ||
convert_dates : bool, default True | ||
Convert date variables to DataFrame time values. | ||
convert_categoricals : bool, default True | ||
Read value labels and convert columns to Categorical/Factor variables. | ||
index_col : str, optional | ||
Column to set as index. | ||
convert_missing : bool, default False | ||
Flag indicating whether to convert missing values to their Stata | ||
representations. If False, missing values are replaced with nan. | ||
If True, columns containing missing values are returned with | ||
object data types and missing values are represented by | ||
StataMissingValue objects. | ||
preserve_dtypes : bool, default True | ||
Preserve Stata datatypes. If False, numeric data are upcast to pandas | ||
default types for foreign data (float64 or int64). | ||
columns : list or None | ||
Columns to retain. Columns will be returned in the given order. None | ||
returns all columns. | ||
order_categoricals : bool, default True | ||
Flag indicating whether converted categorical data are ordered. | ||
chunksize : int, default None | ||
Return StataReader object for iterations, returns chunks with | ||
given number of lines. | ||
iterator : bool, default False | ||
Return StataReader object. | ||
compression : str or dict, default 'infer' | ||
For on-the-fly decompression of on-disk data. If 'infer' and | ||
'filepath_or_buffer' is path-like, then detect compression from the | ||
following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar', | ||
'.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression). | ||
If using 'zip' or 'tar', the ZIP file must contain only one | ||
data file to be read in. Set to ``None`` for no decompression. | ||
Can also be a dict with key ``'method'`` set to one of | ||
{``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and | ||
other key-value pairs are forwarded to | ||
``zipfile.ZipFile``, ``gzip.GzipFile``, | ||
``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or | ||
``tarfile.TarFile``, respectively. | ||
As an example, the following could be passed for Zstandard decompression using a | ||
custom compression dictionary: | ||
``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. | ||
|
||
.. versionadded:: 1.5.0 | ||
Added support for `.tar` files. | ||
storage_options : dict, optional | ||
Extra options that make sense for a particular storage connection, e.g. | ||
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs | ||
are forwarded to ``urllib.request.Request`` as header options. For other | ||
URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are | ||
forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more | ||
details, and for more examples on storage options refer `here | ||
<https://pandas.pydata.org/docs/user_guide/io.html? | ||
highlight=storage_options#reading-writing-remote-files>`_. | ||
|
||
Returns | ||
------- | ||
DataFrame, pandas.api.typing.StataReader | ||
If iterator or chunksize, returns StataReader, else DataFrame. | ||
|
||
See Also | ||
-------- | ||
io.stata.StataReader : Low-level reader for Stata data files. | ||
DataFrame.to_stata: Export Stata data files. | ||
|
||
Notes | ||
----- | ||
Categorical variables read through an iterator may not have the same | ||
categories and dtype. This occurs when a variable stored in a DTA | ||
file is associated to an incomplete set of value labels that only | ||
label a strict subset of the values. | ||
|
||
Examples | ||
-------- | ||
|
||
Creating a dummy stata for this example | ||
|
||
>>> df = pd.DataFrame( | ||
... { | ||
... "animal": ["falcon", "parrot", "falcon", "parrot"], | ||
... "speed": [350, 18, 361, 15], | ||
... } | ||
... ) # doctest: +SKIP | ||
>>> df.to_stata("animals.dta") # doctest: +SKIP | ||
|
||
Read a Stata dta file: | ||
|
||
>>> df = pd.read_stata("animals.dta") # doctest: +SKIP | ||
|
||
Read a Stata dta file in 10,000 line chunks: | ||
|
||
>>> values = np.random.randint( | ||
... 0, 10, size=(20_000, 1), dtype="uint8" | ||
... ) # doctest: +SKIP | ||
>>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP | ||
>>> df.to_stata("filename.dta") # doctest: +SKIP | ||
|
||
>>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP | ||
...     for chunk in itr: | |
...         # Operate on a single chunk, e.g., chunk.mean() | ||
...         pass  # doctest: +SKIP | ||
""" | ||
reader = StataReader( | ||
filepath_or_buffer, | ||
convert_dates=convert_dates, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can `_read_method_doc` (and any associated variables) be removed after this change?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi Matthew, yes the `_read_method_doc` can be removed. Should I remove it in this PR?