pandas-dev · mroeschke · May 7, 2022 · Dec 6, 2021 · Dec 6, 2021 · Dec 6, 2021
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3020,11 +3020,13 @@ def to_xml(
             layout of elements and attributes from original output. This
             argument requires ``lxml`` to be installed. Only XSLT 1.0
             scripts and not later versions is currently supported.
-        compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
+        compression : {{'infer', 'gzip', 'bz2',
+            'zip', 'tar', 'xz', None}}, default 'infer'
             For on-the-fly decompression of on-disk data. If 'infer', then use
-            gzip, bz2, zip or xz if path_or_buffer is a string ending in
-            '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression
-            otherwise. If using 'zip', the ZIP file must contain only one data
+            gzip, bz2, zip, xz or tar if path_or_buffer is a string ending in
+            '.gz', '.bz2', '.zip', '.xz' or containing '.tar' respectively,
+            and no decompression otherwise.
+            If using 'zip' or 'tar', the archive must contain only one data
             file to be read in. Set to None for no decompression.
         {storage_options}
 

diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -18,6 +18,7 @@
 import mmap
 import os
 from pathlib import Path
+import tarfile
 import tempfile
 from typing import (
     IO,
@@ -262,7 +263,7 @@ def _get_filepath_or_buffer(
     ----------
     filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                          or buffer
-    compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional
+    compression : {{'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, optional
     encoding : the encoding to use to decode bytes, default is 'utf-8'
     mode : str, optional
 
@@ -496,9 +497,9 @@ def infer_compression(
     ----------
     filepath_or_buffer : str or file handle
         File path or object.
-    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
+    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}
         If 'infer' and `filepath_or_buffer` is path-like, then detect
-        compression from the following extensions: '.gz', '.bz2', '.zip',
+        compression from the following extensions: '.gz', '.bz2', '.zip', '.tar',
         or '.xz' (otherwise no compression).
 
     Returns
@@ -520,6 +521,9 @@ def infer_compression(
             # Cannot infer compression of a buffer, assume no compression
             return None
 
+        if ".tar" in filepath_or_buffer:
+            return "tar"
+
         # Infer compression from the filename/URL extension
         for compression, extension in _compression_to_extension.items():
             if filepath_or_buffer.lower().endswith(extension):
@@ -747,6 +751,21 @@ def get_handle(
                         f"Only one file per ZIP: {zip_names}"
                     )
 
+        # TAR Archive
+        elif compression == "tar":
+            tar = tarfile.open(handle, "r:*")
+            handles.append(tar)
+            files = tar.getnames()
+            if len(files) == 1:
+                handle = tar.extractfile(files[0])
+            elif len(files) == 0:
+                raise ValueError(f"Zero files found in TAR archive {path_or_buf}")
+            else:
+                raise ValueError(
+                    "Multiple files found in TAR archive. "
+                    f"Only one file per TAR archive: {files}"
+                )
+
         # XZ Compression
         elif compression == "xz":
             handle = get_lzma_file()(handle, ioargs.mode)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
@@ -475,11 +475,12 @@ def read_json(
 
            ``JsonReader`` is a context manager.
 
-    compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
+    compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer'
         For on-the-fly decompression of on-disk data. If 'infer', then use
-        gzip, bz2, zip or xz if path_or_buf is a string ending in
-        '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression
-        otherwise. If using 'zip', the ZIP file must contain only one data
+        gzip, bz2, zip, xz or tar if path_or_buf is a string ending in
+        '.gz', '.bz2', '.zip', '.xz' or containing '.tar' respectively,
+        and no decompression otherwise.
+        If using 'zip' or 'tar', the archive must contain only one data
         file to be read in. Set to None for no decompression.
 
     nrows : int, optional

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -279,11 +279,11 @@
     .. versionchanged:: 1.2
 
        ``TextFileReader`` is a context manager.
-compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
+compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer'
     For on-the-fly decompression of on-disk data. If 'infer' and
     `filepath_or_buffer` is path-like, then detect compression from the
-    following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
-    decompression). If using 'zip', the ZIP file must contain only one data
+    following extensions: '.gz', '.bz2', '.zip', '.tar', or '.xz' (otherwise no
+    decompression). If using 'zip' or 'tar', the archive must contain only one data
     file to be read in. Set to None for no decompression.
 thousands : str, optional
     Thousands separator.

diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
@@ -134,9 +134,9 @@ def read_pickle(
         .. versionchanged:: 1.0.0
            Accept URL. URL is not limited to S3 and GCS.
 
-    compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
+    compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer'
         If 'infer' and 'path_or_url' is path-like, then detect compression from
-        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
+        the following extensions: '.gz', '.bz2', '.zip', '.tar', or '.xz' (otherwise no
         compression) If 'infer' and 'path_or_url' is not path-like, then use
         None (= no decompression).
 

diff --git a/pandas/io/xml.py b/pandas/io/xml.py
@@ -68,9 +68,9 @@ class _XMLFrameParser:
         URL, file, file-like object, or a raw string containing XSLT,
         `etree` does not support XSLT but retained for consistency.
 
-    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
+    compression : {'infer', 'gzip', 'bz2', 'zip', 'tar', 'xz', None}, default 'infer'
         Compression type for on-the-fly decompression of on-disk data.
-        If 'infer', then use extension for gzip, bz2, zip or xz.
+        If 'infer', then use extension for gzip, bz2, zip, tar or xz.
 
     storage_options : dict, optional
         Extra options that make sense for a particular storage connection,
@@ -801,11 +801,12 @@ def read_xml(
         transformation and not the original XML document. Only XSLT 1.0
         scripts and not later versions is currently supported.
 
-    compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
+    compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer'
         For on-the-fly decompression of on-disk data. If 'infer', then use
-        gzip, bz2, zip or xz if path_or_buffer is a string ending in
-        '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression
-        otherwise. If using 'zip', the ZIP file must contain only one data
+        gzip, bz2, zip, xz, or tar if path_or_buffer is a string ending in
+        '.gz', '.bz2', '.zip', '.xz', or containing '.tar' respectively,
+        and no decompression otherwise.
+        If using 'zip' or 'tar', the archive must contain only one data
         file to be read in. Set to None for no decompression.
 
     {storage_options}

diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py
@@ -162,6 +162,14 @@ def test_invalid_compression(all_parsers, invalid_compression):
         parser.read_csv("test_file.zip", **compress_kwargs)
 
 
+@skip_pyarrow
+def test_compression_tar_archive(all_parsers, csv_dir_path):
+    parser = all_parsers
+    path = os.path.join(csv_dir_path, "tar_csv.tar.gz")  # name contains ".tar"
+    df = parser.read_csv(path)  # no compression kwarg passed; docs say default 'infer'
+    assert list(df.columns) == ["a"]  # checks only the column labels, not the rows
+
+
 def test_ignore_compression_extension(all_parsers):
     parser = all_parsers
     df = DataFrame({"a": [0, 1]})