Merged

Changes from 12 commits (of 38 commits)
c1823ef
Add reproduction test for .tar.gz archives
Skn0tt Dec 6, 2021
9a85cba
add support for .tar archives
Skn0tt Dec 6, 2021
e673061
update doc comments
Skn0tt Dec 6, 2021
a0d6386
fix: pep8 errors
Skn0tt Dec 6, 2021
6a8edef
refactor: flip _compression_to_extension around to support multiple e…
Skn0tt Dec 7, 2021
d4e40c9
refactor: detect tar files using existing extension mapping
Skn0tt Dec 7, 2021
5f22df7
feat: add support for writing tar files
Skn0tt Dec 7, 2021
c6573ef
feat: assure it respects .gz endings
Skn0tt Dec 15, 2021
f3b6ed5
Merge branch 'master' into read-tar-archives
Skn0tt Dec 15, 2021
a4ac382
feat: add "tar" entry to compressionoptions
Skn0tt Dec 15, 2021
e66826b
chore: add whatsnew entry
Skn0tt Dec 15, 2021
941be37
fix: test_compression_size_fh
Skn0tt Dec 15, 2021
e3369aa
Merge branch 'master' into read-tar-archives
Skn0tt Jan 4, 2022
0468e5f
add tarfile to shared compression docs
Skn0tt Jan 4, 2022
2531ee0
fix formatting
Skn0tt Jan 4, 2022
57eba0a
pass through "mode" via compression args
Skn0tt Jan 4, 2022
38f7d54
fix pickle test
Skn0tt Jan 4, 2022
887fd10
add class comment
Skn0tt Jan 4, 2022
fc2e7f0
Merge remote-tracking branch 'origin/main' into read-tar-archives
Skn0tt Apr 9, 2022
669d942
sort imports
Skn0tt Apr 9, 2022
7d7d3c6
add _compression_to_extension back for backwards compatibility
Skn0tt Apr 9, 2022
8b8b8ac
fix some type warnings
Skn0tt Apr 9, 2022
dd356f6
fix: formatting
Skn0tt Apr 9, 2022
514014a
fix: mypy complaints
Skn0tt Apr 9, 2022
38971c7
fix: more tests
Skn0tt Apr 9, 2022
e35d361
fix: some error with xml
Skn0tt Apr 9, 2022
c5088fc
fix: interpreted text role
Skn0tt Apr 9, 2022
f6c5173
move to v1.5 whatsnw
Skn0tt Apr 9, 2022
9a4fa07
add versionadded note
Skn0tt Apr 11, 2022
0c31aa8
don't leave blank lines
Skn0tt Apr 11, 2022
086c598
add tests for zero files / multiple files
Skn0tt Apr 13, 2022
861faf0
move _compression_to_extension to tests
Skn0tt Apr 13, 2022
9458ecb
revert added "mode" argument
Skn0tt Apr 13, 2022
d20f315
add test to ensure that `compression.mode` works
Skn0tt Apr 13, 2022
1066f1b
Merge branch 'main' into read-tar-archives
Skn0tt Apr 25, 2022
6b0e1e6
Merge branch 'main' into read-tar-archives
Skn0tt May 5, 2022
0d9ed18
compare strings, not bytes
Skn0tt May 5, 2022
37370c2
replace carriage returns
Skn0tt May 6, 2022
23 changes: 23 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
@@ -196,6 +196,29 @@ representation of :class:`DataFrame` objects (:issue:`4889`).

.. _whatsnew_140.enhancements.other:

Reading directly from TAR archives
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

I/O methods like :func:`read_csv` or :meth:`DataFrame.to_json` now allow reading from and writing
to TAR archives directly (:issue:`44787`).

.. code-block:: python

df = pd.read_csv("./movement.tar.gz")
# ...
df.to_csv("./out.tar.gz")

This supports ``.tar``, ``.tar.gz``, ``.tar.bz2`` and ``.tar.xz`` archives.
The compression method used is inferred from the filename.
If the compression method cannot be inferred, use the ``compression`` argument:

.. code-block:: python

df = pd.read_csv(some_file_obj, compression={"method": "tar", "mode": "r:gz"}) # noqa F821

(``mode`` being one of ``tarfile.open``'s modes: https://docs.python.org/3/library/tarfile.html#tarfile.open)


Other enhancements
^^^^^^^^^^^^^^^^^^
- :meth:`concat` will preserve the ``attrs`` when it is the same for all objects and discard the ``attrs`` when they are different. (:issue:`41828`)
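To complement the whatsnew example above, a minimal write-side sketch (not part of the diff): when writing to an in-memory buffer there is no filename to infer from, so both the method and the tarfile mode go through the ``compression`` dict. The mode passthrough is the behaviour added in 57eba0a.

```python
# Sketch only, assuming the compression-dict mode passthrough from this PR.
import io

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
buf = io.BytesIO()
# "w:gz" = write a gzip-compressed tar; the archive will hold one CSV member.
df.to_csv(buf, compression={"method": "tar", "mode": "w:gz"})
```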
10 changes: 10 additions & 0 deletions pandas/_testing/_io.py
@@ -3,6 +3,8 @@
import bz2
from functools import wraps
import gzip
import io
import tarfile
from typing import (
TYPE_CHECKING,
Any,
@@ -387,6 +389,14 @@ def write_to_compressed(compression, path, data, dest="test"):
mode = "w"
args = (dest, data)
method = "writestr"
elif compression == "tar":
Member:
I think this entire function could be replaced with a call to get_handle (not needed in this PR)

compress_method = tarfile.TarFile
mode = "w"
file = tarfile.TarInfo(name=dest)
bytes = io.BytesIO(data)
file.size = len(data)
args = (file, bytes)
method = "addfile"
elif compression == "gzip":
compress_method = gzip.GzipFile
elif compression == "bz2":
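To make the review suggestion above concrete, a rough sketch of what delegating write_to_compressed to get_handle might look like; the keyword handling is an assumption about pandas' internal API, not code from this PR.

```python
# Hypothetical rewrite on top of pandas' internal get_handle.
from pandas.io.common import get_handle


def write_to_compressed(compression, path, data, dest="test"):
    compression_args = {"method": compression}
    if compression in ("zip", "tar"):
        # only archive formats carry an inner file name
        compression_args["archive_name"] = dest
    with get_handle(path, "wb", compression=compression_args, is_text=False) as handles:
        handles.handle.write(data)
```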
2 changes: 1 addition & 1 deletion pandas/_typing.py
@@ -243,7 +243,7 @@ def closed(self) -> bool:
# compression keywords and compression
CompressionDict = Dict[str, Any]
CompressionOptions = Optional[
Union[Literal["infer", "gzip", "bz2", "zip", "xz"], CompressionDict]
Union[Literal["infer", "gzip", "bz2", "zip", "xz", "tar"], CompressionDict]
]


4 changes: 2 additions & 2 deletions pandas/conftest.py
@@ -267,15 +267,15 @@ def other_closed(request):
return request.param


@pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz"])
@pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz", "tar"])
def compression(request):
"""
Fixture for trying common compression types in compression tests.
"""
return request.param


@pytest.fixture(params=["gzip", "bz2", "zip", "xz"])
@pytest.fixture(params=["gzip", "bz2", "zip", "xz", "tar"])
def compression_only(request):
"""
Fixture for trying common compression types in compression tests excluding
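For illustration, a hypothetical test that picks up the new "tar" parameter through these fixtures (the test body is a sketch, not part of the PR):

```python
# Illustrative only; fixture names come from conftest.py above.
import pandas as pd
import pandas._testing as tm


def test_compression_roundtrip(compression_only, tmp_path):
    df = pd.DataFrame({"a": [1, 2, 3]})
    path = tmp_path / "data"  # no extension on purpose: compression is explicit
    df.to_csv(path, index=False, compression=compression_only)
    result = pd.read_csv(path, compression=compression_only)
    tm.assert_frame_equal(result, df)
```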
10 changes: 6 additions & 4 deletions pandas/core/frame.py
@@ -3020,11 +3020,13 @@ def to_xml(
layout of elements and attributes from original output. This
argument requires ``lxml`` to be installed. Only XSLT 1.0
scripts and not later versions is currently supported.
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
compression : {{'infer', 'gzip', 'bz2',
'zip', 'tar', 'xz', None}}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2, zip or xz if path_or_buffer is a string ending in
'.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression
otherwise. If using 'zip', the ZIP file must contain only one data
gzip, bz2, zip, xz or tar if path_or_buffer is a string ending in
'.gz', '.bz2', '.zip', '.xz' or containing '.tar' respectively,
and no decompression otherwise.
If using 'zip' or 'tar', the archive must contain only one data
file to be read in. Set to None for no decompression.
{storage_options}

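Since DataFrame.to_xml writes through the same get_handle machinery, tar archives work there as well; an illustrative sketch with a hypothetical filename (parser="etree" only to avoid the optional lxml dependency):

```python
# Sketch only: the ".tar.gz" ending lets pandas infer a gzip-compressed tar.
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
df.to_xml("frame.xml.tar.gz", parser="etree")
```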
4 changes: 4 additions & 0 deletions pandas/core/generic.py
@@ -2341,6 +2341,7 @@ def to_json(
default_handler: Callable[[Any], JSONSerializable] | None = None,
lines: bool_t = False,
compression: CompressionOptions = "infer",
mode: str = "w",
Member:
Is mode needed? I think we expect/require that file handles are opened in binary mode when the user request compression.

Contributor Author:
Nope, not needed. I realised that the passthrough described in
https://github.com/pandas-dev/pandas/pull/44787/files#diff-132ee3be1f83a9f885442f45ed9ccbc96796ae28f97991b7c99ce25d44fd6af7R206
doesn't yet work. Fixed in 57eba0a, and removed the added mode parameter.

index: bool_t = True,
indent: int | None = None,
storage_options: StorageOptions = None,
@@ -2604,6 +2605,7 @@ def to_json(
default_handler=default_handler,
lines=lines,
compression=compression,
mode=mode,
index=index,
indent=indent,
storage_options=storage_options,
@@ -2923,6 +2925,7 @@ def to_pickle(
self,
path,
compression: CompressionOptions = "infer",
mode: str = "wb",
protocol: int = pickle.HIGHEST_PROTOCOL,
storage_options: StorageOptions = None,
) -> None:
@@ -2990,6 +2993,7 @@
self,
path,
compression=compression,
mode=mode,
protocol=protocol,
storage_options=storage_options,
)
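Independently of the mode plumbing shown here (which was reverted later in the PR, see commit 9458ecb), the user-facing effect on the pickle path is a plain round trip; a small sketch with an illustrative filename:

```python
# Sketch only: compression is inferred from the ".tar.gz" suffix, and the
# archive contains a single pickled member.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
df.to_pickle("frame.tar.gz")
restored = pd.read_pickle("frame.tar.gz")
```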
139 changes: 129 additions & 10 deletions pandas/io/common.py
@@ -9,6 +9,7 @@
from io import (
BufferedIOBase,
BytesIO,
FileIO,
RawIOBase,
StringIO,
TextIOBase,
@@ -17,6 +18,7 @@
import mmap
import os
from pathlib import Path
import tarfile
from typing import (
IO,
Any,
@@ -260,7 +262,7 @@ def _get_filepath_or_buffer(
----------
filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
or buffer
compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional
compression : {{'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, optional
encoding : the encoding to use to decode bytes, default is 'utf-8'
mode : str, optional

@@ -443,7 +445,17 @@ def file_path_to_url(path: str) -> str:
return urljoin("file:", pathname2url(path))


_compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"}
_extension_to_compression = {
".tar": "tar",
".tar.gz": "tar",
".tar.bz2": "tar",
".tar.xz": "tar",
".gz": "gzip",
".bz2": "bz2",
".zip": "zip",
".xz": "xz",
}
_supported_compressions = set(_extension_to_compression.values())


def get_compression_method(
@@ -494,9 +506,9 @@ def infer_compression(
----------
filepath_or_buffer : str or file handle
File path or object.
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}
If 'infer' and `filepath_or_buffer` is path-like, then detect
compression from the following extensions: '.gz', '.bz2', '.zip',
compression from the following extensions: '.gz', '.bz2', '.zip', '.tar',
or '.xz' (otherwise no compression).

Returns
@@ -519,20 +531,18 @@
return None

# Infer compression from the filename/URL extension
for compression, extension in _compression_to_extension.items():
for extension, compression in _extension_to_compression.items():
if filepath_or_buffer.lower().endswith(extension):
return compression
return None

# Compression has been specified. Check that it's valid
if compression in _compression_to_extension:
if compression in _supported_compressions:
return compression

# https://github.com/python/mypy/issues/5492
# Unsupported operand types for + ("List[Optional[str]]" and "List[str]")
valid = ["infer", None] + sorted(
_compression_to_extension
) # type: ignore[operator]
valid = ["infer", None] + sorted(_supported_compressions) # type: ignore[operator]
msg = (
f"Unrecognized compression type: {compression}\n"
f"Valid compression types are {valid}"
@@ -677,7 +687,7 @@ def get_handle(
ioargs.encoding,
ioargs.mode,
errors,
ioargs.compression["method"] not in _compression_to_extension,
ioargs.compression["method"] not in _supported_compressions,
)

is_path = isinstance(handle, str)
@@ -745,6 +755,25 @@ def get_handle(
f"Only one file per ZIP: {zip_names}"
)

# TAR Encoding
elif compression == "tar":
if is_path:
handle = _BytesTarFile.open(name=handle, mode=ioargs.mode)
else:
handle = _BytesTarFile.open(fileobj=handle, mode=ioargs.mode)
if handle.mode == "r":
handles.append(handle)
files = handle.getnames()
if len(files) == 1:
handle = handle.extractfile(files[0])
elif len(files) == 0:
raise ValueError(f"Zero files found in TAR archive {path_or_buf}")
else:
raise ValueError(
"Multiple files found in TAR archive. "
f"Only one file per TAR archive: {files}"
)

# XZ Compression
elif compression == "xz":
handle = get_lzma_file()(handle, ioargs.mode)
@@ -823,6 +852,96 @@ def get_handle(
)


class _BytesTarFile(tarfile.TarFile, BytesIO):
Member:
Is it possible to share code with the _BytesZipFile class? Maybe _BytesCompressMixin from which both classes inherit?

Member:
Can you comment on why we need this wrapper?

Contributor Author:
I tried extracting some of their code into a Mixin, but found that there's little room for abstraction. Except for the three lines of write, none of the methods are identical. Since adding a Mixin also makes the code harder to follow, I'd prefer to keep the duplication. Having said that, if you see a good way of abstracting here, I'm more than open to it!

Contributor Author:
Added a comment in 887fd10.


# GH 17778
def __init__(
self,
name: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
mode: str,
fileobj: FileIO,
archive_name: str | None = None,
**kwargs,
):
self.archive_name = archive_name
self.multiple_write_buffer: StringIO | BytesIO | None = None
self._closing = False

super().__init__(name=name, mode=mode, fileobj=fileobj, **kwargs)

@classmethod
def open(cls, name=None, mode="r", **kwargs):
mode = mode.replace("b", "")
return super().open(name=name, mode=cls.extend_mode(name, mode), **kwargs)

@classmethod
def extend_mode(
cls, name: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], mode: str
) -> str:
if mode != "w":
return mode
if isinstance(name, (os.PathLike, str)):
filename = Path(name)
if filename.suffix == ".gz":
return mode + ":gz"
elif filename.suffix == ".xz":
return mode + ":xz"
elif filename.suffix == ".bz2":
return mode + ":bz2"
return mode

def infer_filename(self):
"""
If an explicit archive_name is not given, we still want the file inside the tar
archive not to be named something.tar, because that causes confusion (GH39465).
"""
if isinstance(self.name, (os.PathLike, str)):
filename = Path(self.name)
if filename.suffix == ".tar":
return filename.with_suffix("").name
if filename.suffix in [".tar.gz", ".tar.bz2", ".tar.xz"]:
return filename.with_suffix("").with_suffix("").name
return filename.name
return None

def write(self, data):
# buffer multiple write calls, write on flush
if self.multiple_write_buffer is None:
self.multiple_write_buffer = (
BytesIO() if isinstance(data, bytes) else StringIO()
)
self.multiple_write_buffer.write(data)

def flush(self) -> None:
# write to actual handle and close write buffer
if self.multiple_write_buffer is None or self.multiple_write_buffer.closed:
return

# TarFile needs a non-empty string
archive_name = self.archive_name or self.infer_filename() or "tar"
with self.multiple_write_buffer:
value = self.multiple_write_buffer.getvalue()
tarinfo = tarfile.TarInfo(name=archive_name)
tarinfo.size = len(value)
self.addfile(tarinfo, BytesIO(value))

def close(self):
self.flush()
super().close()

@property
def closed(self):
if self.multiple_write_buffer is None:
return False
return self.multiple_write_buffer.closed and super().closed

@closed.setter
def closed(self, value):
if not self._closing and value:
self._closing = True
self.close()


# error: Definition of "__exit__" in base class "ZipFile" is incompatible with
# definition in base class "BytesIO" [misc]
# error: Definition of "__enter__" in base class "ZipFile" is incompatible with
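Putting the _BytesTarFile wrapper and the new get_handle branch together, an end-to-end sketch (not part of the diff) of the buffer path: a tar built in memory has no filename, so the tarfile mode has to be spelled out, exactly as the whatsnew entry describes.

```python
# Build a gzip-compressed tar with exactly one CSV member in memory, then
# read it back through pandas with an explicit tarfile mode.
import io
import tarfile

import pandas as pd

payload = b"a,b\n1,2\n3,4\n"
buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode="w:gz") as tar:
    info = tarfile.TarInfo(name="data.csv")
    info.size = len(payload)
    tar.addfile(info, io.BytesIO(payload))
buf.seek(0)

df = pd.read_csv(buf, compression={"method": "tar", "mode": "r:gz"})
```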
12 changes: 7 additions & 5 deletions pandas/io/json/_json.py
@@ -83,6 +83,7 @@ def to_json(
default_handler: Callable[[Any], JSONSerializable] | None = None,
lines: bool = False,
compression: CompressionOptions = "infer",
mode: str = "w",
index: bool = True,
indent: int = 0,
storage_options: StorageOptions = None,
Expand Down Expand Up @@ -127,7 +128,7 @@ def to_json(
if path_or_buf is not None:
# apply compression and byte/text conversion
with get_handle(
path_or_buf, "w", compression=compression, storage_options=storage_options
path_or_buf, mode, compression=compression, storage_options=storage_options
) as handles:
handles.handle.write(s)
else:
@@ -475,11 +476,12 @@ def read_json(

``JsonReader`` is a context manager.

compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2, zip or xz if path_or_buf is a string ending in
'.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression
otherwise. If using 'zip', the ZIP file must contain only one data
gzip, bz2, zip, xz or tar if path_or_buf is a string ending in
'.gz', '.bz2', '.zip', '.xz' or containing '.tar' respectively,
and no decompression otherwise.
If using 'zip' or 'tar', the archive must contain only one data
file to be read in. Set to None for no decompression.

nrows : int, optional
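For completeness, the JSON side mirrors the docstring above; a short round-trip sketch with an illustrative filename:

```python
# Sketch only: ".tar.gz" is inferred as a gzip-compressed tar holding one
# JSON-lines member.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
df.to_json("records.tar.gz", orient="records", lines=True)
roundtrip = pd.read_json("records.tar.gz", orient="records", lines=True)
```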
6 changes: 3 additions & 3 deletions pandas/io/parsers/readers.py
@@ -280,11 +280,11 @@
.. versionchanged:: 1.2

``TextFileReader`` is a context manager.
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer' and
`filepath_or_buffer` is path-like, then detect compression from the
following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
decompression). If using 'zip', the ZIP file must contain only one data
following extensions: '.gz', '.bz2', '.zip', '.tar', '.xz' (otherwise no
decompression). If using 'zip' or 'tar', the archive must contain only one data
file to be read in. Set to None for no decompression.
thousands : str, optional
Thousands separator.