Skip to content

Commit 7f971c4

Browse files
ENH: Add read archive function
1 parent 5ccd341 commit 7f971c4

File tree

2 files changed

+263
-0
lines changed

2 files changed

+263
-0
lines changed

janitor/io.py

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
import inspect
44
import os
55
import subprocess
6+
import tarfile
67
import warnings
8+
import zipfile
79
from collections import defaultdict
810
from glob import glob
911
from io import StringIO
@@ -689,3 +691,187 @@ def _object_to_dict(obj):
689691
data[key] = _object_to_dict(value)
690692
return data
691693
return obj
694+
695+
696+
#################################################################
697+
698+
699+
def read_archive(
700+
file_path: str,
701+
extract_to_df: bool = True,
702+
file_type: str | None = None,
703+
selected_files: list[str] | None = None,
704+
) -> pd.DataFrame | list[str]:
705+
"""
706+
Reads an archive file (.zip, .tar, .tar.gz) and optionally lists its content
707+
or extracts specific files into a DataFrame.
708+
709+
Args:
710+
file_path: The path to the archive file.
711+
extract_to_df: Whether to read the contents into a DataFrame
712+
(for CSV or similar formats). Default is True.
713+
file_type: Optional file type hint ('zip', 'tar', 'tar.gz').
714+
If None, it will be inferred from the file extension.
715+
selected_files: List of files to read directly without user interaction.
716+
717+
Returns:
718+
- A pandas DataFrame if extract_to_df is True
719+
and the user selects a file to load.
720+
- A list of dataframes that contains
721+
compatible file names in the archive otherwise.
722+
"""
723+
file_type = file_type or _infer_file_type(file_path)
724+
725+
if file_type == "zip":
726+
return _process_zip_archive(file_path, extract_to_df, selected_files)
727+
elif file_type in {"tar", "tar.gz"}:
728+
return _process_tar_archive(file_path, extract_to_df, selected_files)
729+
else:
730+
raise ValueError(
731+
"Unsupported archive format. Supported formats are .zip, .tar, or .tar.gz."
732+
)
733+
734+
735+
def _process_zip_archive(
    file_path: str, extract_to_df: bool, selected_files: list[str] | None
) -> pd.DataFrame | list[str]:
    """
    Open a ZIP archive and either list or load its compatible members.

    Args:
        file_path: Path to the ZIP archive.
        extract_to_df: If True, read the selected members into DataFrames;
            otherwise only return the compatible member names.
        selected_files: Member names to read, or None to ask the user.

    Returns:
        The loaded data when `extract_to_df` is True, otherwise the list
        of compatible file names.
    """
    with zipfile.ZipFile(file_path) as zip_handle:
        candidates = _list_compatible_files(zip_handle.namelist())

        # Listing-only mode: hand back the names without reading a member.
        if not extract_to_df:
            return candidates

        # Members have to be read while the archive is still open.
        return _select_and_extract_from_zip(
            zip_handle, candidates, selected_files
        )
747+
748+
749+
def _process_tar_archive(
    file_path: str, extract_to_df: bool, selected_files: list[str] | None
) -> pd.DataFrame | list[pd.DataFrame] | list[str]:
    """
    Open a TAR archive and either list or load its compatible members.

    Args:
        file_path: Path to the TAR archive (plain or compressed).
        extract_to_df: If True, read the selected members into DataFrames;
            otherwise only return the compatible member names.
        selected_files: Member names to read, or None to ask the user.

    Returns:
        The loaded data when `extract_to_df` is True, otherwise the list
        of compatible file names.
    """
    # Mode "r" makes tarfile detect the compression from the file content.
    # Guessing it from the ".gz" suffix breaks for archives whose name and
    # actual compression disagree.
    with tarfile.open(file_path, "r") as archive:
        compatible_files = _list_compatible_files(archive.getnames())

        if extract_to_df:
            # Members have to be read while the archive is still open.
            return _select_and_extract_from_tar(
                archive, compatible_files, selected_files
            )
        return compatible_files
762+
763+
764+
def _select_and_extract_from_zip(
765+
archive: zipfile.ZipFile,
766+
compatible_files: list[str],
767+
selected_files: list[str] | None,
768+
) -> pd.DataFrame | list[pd.DataFrame]:
769+
"""Select and read specific files from a ZIP archive."""
770+
if not selected_files:
771+
selected_files = _select_files_interactively(compatible_files)
772+
773+
dfs = []
774+
for selected_file in selected_files:
775+
with archive.open(selected_file) as file:
776+
if selected_file.endswith(".csv"):
777+
dfs.append(pd.read_csv(file))
778+
elif selected_file.endswith(".xlsx"):
779+
dfs.append(pd.read_excel(file))
780+
return dfs if len(dfs) > 1 else dfs[0]
781+
782+
783+
def _select_and_extract_from_tar(
784+
archive: tarfile.TarFile,
785+
compatible_files: list[str],
786+
selected_files: list[str] | None,
787+
) -> pd.DataFrame | list[pd.DataFrame]:
788+
"""Select and read specific files from a TAR archive."""
789+
if not selected_files:
790+
selected_files = _select_files_interactively(compatible_files)
791+
792+
dfs = []
793+
for selected_file in selected_files:
794+
member = archive.getmember(selected_file)
795+
with archive.extractfile(member) as file:
796+
if selected_file.endswith(".csv"):
797+
dfs.append(pd.read_csv(file))
798+
elif selected_file.endswith(".xlsx"):
799+
dfs.append(pd.read_excel(file))
800+
return dfs if len(dfs) > 1 else dfs[0]
801+
802+
803+
def _select_files_interactively(compatible_files: list[str]) -> list[str]:
    """
    Print a numbered menu of files and return the ones the user picks.

    The user answers with comma-separated 1-based numbers. Entries that
    are not digits or fall outside the menu range are ignored.

    Args:
        compatible_files: List of compatible file names.

    Returns:
        List of selected file names, in the order they were entered.

    Raises:
        ValueError: If none of the entries refers to a listed file.
    """
    print("Compatible files found in the archive:")
    for position, name in enumerate(compatible_files, start=1):
        print(f"{position}. {name}")

    answer = input(
        "Enter the numbers of the files to read, "
        "separated by commas (e.g., 1,2,3): "
    )
    tokens = answer.strip().split(",")

    menu_size = len(compatible_files)
    chosen = []
    for token in tokens:
        # Skip anything that is not a plain number (blank, text, ...).
        if not token.strip().isdigit():
            continue
        number = int(token)
        if 1 <= number <= menu_size:
            # The menu is 1-based, the list is 0-based.
            chosen.append(compatible_files[number - 1])

    if chosen:
        return chosen
    raise ValueError("No valid files selected.")
833+
834+
835+
def _list_compatible_files(file_names: list[str]) -> list[str]:
    """
    Keep only the archive members that can be loaded (.csv and .xlsx).

    The detected names are also printed to standard output.

    Args:
        file_names: List of file names in the archive.

    Returns:
        The names ending in ".csv" or ".xlsx", in their original order.

    Raises:
        ValueError: If no name has a supported extension.
    """
    supported_suffixes = (".csv", ".xlsx")
    matches = []
    for candidate in file_names:
        if candidate.endswith(supported_suffixes):
            matches.append(candidate)

    print("Compatible files detected :", matches)
    if matches:
        return matches
    raise ValueError("No compatible files found in the archive.")
854+
855+
856+
def _infer_file_type(file_path: str) -> str:
    """
    Infer the type of the archive based on the file extension.

    The comparison ignores letter case, and ".tgz" is treated as the
    common short form of ".tar.gz".

    Args:
        file_path: Path to the file.

    Returns:
        A string representing the archive type ('zip', 'tar', 'tar.gz').

    Raises:
        ValueError: If the file extension is unsupported.
    """
    # Lower-case once so that names such as "DATA.ZIP" are recognised too.
    normalized = file_path.lower()
    if normalized.endswith(".zip"):
        return "zip"
    if normalized.endswith((".tar.gz", ".tgz")):
        return "tar.gz"
    if normalized.endswith(".tar"):
        return "tar"
    raise ValueError(
        "Cannot infer file type from the file extension. "
        "Please specify the 'file_type' parameter."
    )

tests/io/test_read_archive.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import io
2+
import tarfile
3+
import zipfile
4+
5+
import pandas as pd
6+
import pytest
7+
8+
from janitor.io import read_archive
9+
10+
11+
@pytest.fixture
def zip_test_file(tmp_path):
    """Create a test ZIP archive with two small CSV files; return its path."""
    members = {
        "file1.csv": "col1,col2\n1,2\n3,4",
        "file2.csv": "col3,col4\n5,6\n7,8",
    }
    archive_path = tmp_path / "test.zip"
    with zipfile.ZipFile(archive_path, mode="w") as bundle:
        for member_name, csv_text in members.items():
            bundle.writestr(member_name, csv_text)
    return archive_path
19+
20+
21+
@pytest.fixture
def tar_test_file(tmp_path):
    """Create a gzipped test TAR with two small CSV files; return its path."""
    payloads = (
        ("file1.csv", b"col1,col2\n1,2\n3,4"),
        ("file2.csv", b"col3,col4\n5,6\n7,8"),
    )
    archive_path = tmp_path / "test.tar.gz"
    with tarfile.open(archive_path, mode="w:gz") as bundle:
        for member_name, raw in payloads:
            entry = tarfile.TarInfo(name=member_name)
            # The header must carry the payload size for addfile() to
            # copy the right number of bytes.
            entry.size = len(raw)
            bundle.addfile(entry, io.BytesIO(raw))
    return archive_path
36+
37+
38+
def test_read_zip_archive(zip_test_file):
    """Reading one named CSV from the ZIP fixture yields one DataFrame."""
    frame = read_archive(
        str(zip_test_file),
        extract_to_df=True,
        selected_files=["file1.csv"],
    )
    assert isinstance(frame, pd.DataFrame)
    assert list(frame.columns) == ["col1", "col2"]
    assert frame.shape == (2, 2)
45+
46+
47+
def test_list_files_in_zip(zip_test_file):
    """With extract_to_df=False the ZIP's CSV names are returned as a list."""
    listing = read_archive(str(zip_test_file), extract_to_df=False)
    assert isinstance(listing, list)
    for expected_name in ("file1.csv", "file2.csv"):
        assert expected_name in listing
52+
53+
54+
def test_no_compatible_files(tmp_path):
    """An archive without .csv/.xlsx members makes read_archive raise."""
    archive_path = tmp_path / "empty.zip"
    with zipfile.ZipFile(archive_path, mode="w") as bundle:
        bundle.writestr("file1.txt", "Just some text")

    # The archive is closed (fully written) before it is read back.
    expected_message = "No compatible files found in the archive"
    with pytest.raises(ValueError, match=expected_message):
        read_archive(str(archive_path))
62+
63+
64+
def test_read_tar_archive(tar_test_file):
    """Reading one named CSV from the TAR fixture yields one DataFrame."""
    frame = read_archive(
        str(tar_test_file),
        extract_to_df=True,
        selected_files=["file1.csv"],
    )
    assert isinstance(frame, pd.DataFrame)
    assert list(frame.columns) == ["col1", "col2"]
    assert frame.shape == (2, 2)
71+
72+
73+
def test_list_files_in_tar(tar_test_file):
    """With extract_to_df=False the TAR's CSV names are returned as a list."""
    listing = read_archive(str(tar_test_file), extract_to_df=False)
    assert isinstance(listing, list)
    for expected_name in ("file1.csv", "file2.csv"):
        assert expected_name in listing

0 commit comments

Comments
 (0)