|
3 | 3 | import inspect
|
4 | 4 | import os
|
5 | 5 | import subprocess
|
| 6 | +import tarfile |
6 | 7 | import warnings
|
| 8 | +import zipfile |
7 | 9 | from collections import defaultdict
|
8 | 10 | from glob import glob
|
9 | 11 | from io import StringIO
|
@@ -689,3 +691,187 @@ def _object_to_dict(obj):
|
689 | 691 | data[key] = _object_to_dict(value)
|
690 | 692 | return data
|
691 | 693 | return obj
|
| 694 | + |
| 695 | + |
| 696 | +################################################################# |
| 697 | + |
| 698 | + |
| 699 | +def read_archive( |
| 700 | + file_path: str, |
| 701 | + extract_to_df: bool = True, |
| 702 | + file_type: str | None = None, |
| 703 | + selected_files: list[str] | None = None, |
| 704 | +) -> pd.DataFrame | list[str]: |
| 705 | + """ |
| 706 | + Reads an archive file (.zip, .tar, .tar.gz) and optionally lists its content |
| 707 | + or extracts specific files into a DataFrame. |
| 708 | +
|
| 709 | + Args: |
| 710 | + file_path: The path to the archive file. |
| 711 | + extract_to_df: Whether to read the contents into a DataFrame |
| 712 | + (for CSV or similar formats). Default is True. |
| 713 | + file_type: Optional file type hint ('zip', 'tar', 'tar.gz'). |
| 714 | + If None, it will be inferred from the file extension. |
| 715 | + selected_files: List of files to read directly without user interaction. |
| 716 | +
|
| 717 | + Returns: |
| 718 | + - A pandas DataFrame if extract_to_df is True |
| 719 | + and the user selects a file to load. |
| 720 | + - A list of dataframes that contains |
| 721 | + compatible file names in the archive otherwise. |
| 722 | + """ |
| 723 | + file_type = file_type or _infer_file_type(file_path) |
| 724 | + |
| 725 | + if file_type == "zip": |
| 726 | + return _process_zip_archive(file_path, extract_to_df, selected_files) |
| 727 | + elif file_type in {"tar", "tar.gz"}: |
| 728 | + return _process_tar_archive(file_path, extract_to_df, selected_files) |
| 729 | + else: |
| 730 | + raise ValueError( |
| 731 | + "Unsupported archive format. Supported formats are .zip, .tar, or .tar.gz." |
| 732 | + ) |
| 733 | + |
| 734 | + |
def _process_zip_archive(
    file_path: str, extract_to_df: bool, selected_files: list[str] | None
) -> pd.DataFrame | list[str]:
    """
    List the compatible members of a ZIP archive, or load them.

    Args:
        file_path: Path to the ZIP archive.
        extract_to_df: If True, load the selected members into DataFrames;
            otherwise only return the compatible member names.
        selected_files: Member names to load, or None to ask the user.

    Returns:
        The loaded data when extract_to_df is True, otherwise the list of
        compatible member names.
    """
    with zipfile.ZipFile(file_path) as zip_handle:
        candidates = _list_compatible_files(zip_handle.namelist())

        # Listing-only mode: no member is opened.
        if not extract_to_df:
            return candidates

        # The archive must stay open while its members are being read.
        return _select_and_extract_from_zip(zip_handle, candidates, selected_files)
| 747 | + |
| 748 | + |
def _process_tar_archive(
    file_path: str, extract_to_df: bool, selected_files: list[str] | None
) -> pd.DataFrame | list[pd.DataFrame] | list[str]:
    """
    List the compatible members of a TAR archive, or load them.

    Args:
        file_path: Path to the TAR archive (plain or compressed).
        extract_to_df: If True, load the selected members into DataFrames;
            otherwise only return the compatible member names.
        selected_files: Member names to load, or None to ask the user.

    Returns:
        The loaded data when extract_to_df is True, otherwise the list of
        compatible member names.
    """
    # "r:*" lets tarfile detect the compression from the file content, so the
    # archive opens correctly whatever its extension says (e.g. a caller-given
    # type hint for a file with an unusual or misleading name).
    with tarfile.open(file_path, "r:*") as archive:
        compatible_files = _list_compatible_files(archive.getnames())

        if extract_to_df:
            # The archive must stay open while its members are being read.
            return _select_and_extract_from_tar(
                archive, compatible_files, selected_files
            )
        return compatible_files
| 762 | + |
| 763 | + |
| 764 | +def _select_and_extract_from_zip( |
| 765 | + archive: zipfile.ZipFile, |
| 766 | + compatible_files: list[str], |
| 767 | + selected_files: list[str] | None, |
| 768 | +) -> pd.DataFrame | list[pd.DataFrame]: |
| 769 | + """Select and read specific files from a ZIP archive.""" |
| 770 | + if not selected_files: |
| 771 | + selected_files = _select_files_interactively(compatible_files) |
| 772 | + |
| 773 | + dfs = [] |
| 774 | + for selected_file in selected_files: |
| 775 | + with archive.open(selected_file) as file: |
| 776 | + if selected_file.endswith(".csv"): |
| 777 | + dfs.append(pd.read_csv(file)) |
| 778 | + elif selected_file.endswith(".xlsx"): |
| 779 | + dfs.append(pd.read_excel(file)) |
| 780 | + return dfs if len(dfs) > 1 else dfs[0] |
| 781 | + |
| 782 | + |
| 783 | +def _select_and_extract_from_tar( |
| 784 | + archive: tarfile.TarFile, |
| 785 | + compatible_files: list[str], |
| 786 | + selected_files: list[str] | None, |
| 787 | +) -> pd.DataFrame | list[pd.DataFrame]: |
| 788 | + """Select and read specific files from a TAR archive.""" |
| 789 | + if not selected_files: |
| 790 | + selected_files = _select_files_interactively(compatible_files) |
| 791 | + |
| 792 | + dfs = [] |
| 793 | + for selected_file in selected_files: |
| 794 | + member = archive.getmember(selected_file) |
| 795 | + with archive.extractfile(member) as file: |
| 796 | + if selected_file.endswith(".csv"): |
| 797 | + dfs.append(pd.read_csv(file)) |
| 798 | + elif selected_file.endswith(".xlsx"): |
| 799 | + dfs.append(pd.read_excel(file)) |
| 800 | + return dfs if len(dfs) > 1 else dfs[0] |
| 801 | + |
| 802 | + |
def _select_files_interactively(compatible_files: list[str]) -> list[str]:
    """
    Allow the user to select files from a list interactively.

    The compatible files are printed as a numbered list (starting at 1) and
    the user is asked for a comma-separated list of numbers. Entries that
    are not decimal numbers, or that fall outside the list, are ignored.

    Args:
        compatible_files: List of compatible file names.

    Returns:
        List of selected file names, in the order they were entered.

    Raises:
        ValueError: If no valid entry was given.
    """
    print("Compatible files found in the archive:")
    for idx, file_name in enumerate(compatible_files, 1):
        print(f"{idx}. {file_name}")

    answer = input(
        "Enter the numbers of the files to read, "
        "separated by commas (e.g., 1,2,3): "
    )

    selected_files = []
    for token in answer.split(","):
        token = token.strip()
        # isdecimal() only accepts characters int() can parse; isdigit()
        # also accepts e.g. superscript digits, on which int() raises.
        if not token.isdecimal():
            continue
        position = int(token)
        if 0 < position <= len(compatible_files):
            selected_files.append(compatible_files[position - 1])

    if not selected_files:
        raise ValueError("No valid files selected.")
    return selected_files
| 833 | + |
| 834 | + |
def _list_compatible_files(file_names: list[str]) -> list[str]:
    """
    Keep only the archive members that can be loaded (.csv or .xlsx).

    The detected names are printed before being returned.

    Args:
        file_names: Names of all members of the archive.

    Returns:
        The names ending in .csv or .xlsx, in their original order.

    Raises:
        ValueError: If no member has a compatible extension.
    """
    supported_suffixes = (".csv", ".xlsx")
    matches = [name for name in file_names if name.endswith(supported_suffixes)]
    print("Compatible files detected :", matches)
    if matches:
        return matches
    raise ValueError("No compatible files found in the archive.")
| 854 | + |
| 855 | + |
def _infer_file_type(file_path: str) -> str:
    """
    Infer the type of the archive based on the file extension.

    The extension is matched case-insensitively, and '.tgz' is treated as
    '.tar.gz'.

    Args:
        file_path: Path to the file.

    Returns:
        A string representing the archive type ('zip', 'tar', 'tar.gz').

    Raises:
        ValueError: If the file extension is unsupported.
    """
    lowered_path = file_path.lower()
    if lowered_path.endswith(".zip"):
        return "zip"
    if lowered_path.endswith((".tar.gz", ".tgz")):
        return "tar.gz"
    if lowered_path.endswith(".tar"):
        return "tar"
    raise ValueError(
        "Cannot infer file type from the file extension. "
        "Please specify the 'file_type' parameter."
    )
0 commit comments