Add .sav support and delete_zip_after option

dirreno · dirreno · commit 595fd3eeb4b0 · 2026-02-23T13:26:01.000-05:00
Support reading SPSS .sav files and add an option to remove compressed files after extraction. Changes: bump package version to 1.0.2, update CHANGELOG, import pyreadstat and implement _read_sav, add delete_zip_after parameter (default False) to Extractor with docs and attribute, register '.sav' handler, and add logic to track and delete zip/compressed files after extraction (with logging on success/failure). Updated tests to enable delete_zip_after=True.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,12 @@ The format is based on "Keep a Changelog" (https://keepachangelog.com/en/1.0.0/)
 ## [Unreleased]
 - Prepare improvements and documentation updates.
 
+
+## [1.0.2] - 2026-02-23
+### Added
+- Extractor can now read SPSS `.sav` files.
+- New `delete_zip_after` parameter (default `False`) on Extractor to delete zip/compressed files after extraction.
+
 ## [1.0.1] - 2026-02-20
 ### Fixed
 - Each extracted compressed file is now stored in its own independent folder.
diff --git a/setup.py b/setup.py
@@ -39,7 +39,7 @@
     # For a discussion on single-sourcing the version across setup.py and the
     # project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
-    version='1.0.1',  # Required
+    version='1.0.2',  # Required
 
     # This is a one-line description or tagline of what your project does. This
     # corresponds to the "Summary" metadata field:
diff --git a/src/socio4health/extractor.py b/src/socio4health/extractor.py
@@ -15,6 +15,7 @@
 import os
 import pandas as pd
 import geopandas as gpd
+import pyreadstat
 import dask.dataframe as dd
 from tqdm import tqdm
 import glob
@@ -68,7 +69,7 @@ class Extractor:
     colspecs : list
         Column specifications for fixed-width files, defining the widths of each column. Required if ``is_fwf`` is ``True``.
     sep : str
-        The separator to use when reading ``CSV`` files. Defaults to ``','``.
+        The separator to use when reading ``CSV`` files. Defaults to ``,``.
     ddtype : Union[str, Dict]
         The data type to use when reading files. Can be a single type or a dictionary mapping column names to types. Defaults to ``object``.
     dtype : Union[str, Dict]
@@ -79,6 +80,8 @@ class Extractor:
         The name or index of the Excel sheet to read. Can also be a list to read multiple sheets or ``None`` to read all sheets. Defaults to the first sheet (``0``).
     geodriver : str
         The driver to use for reading geospatial files with ``geopandas.read_file()`` (e.g., ``'ESRI Shapefile'``, ``'KML'``, etc.). Optional.
+    delete_zip_after : bool
+        If ``True``, delete the zip/compressed source files once they have been extracted and the resulting files processed. Defaults to ``False``.
 
     Important
     ------
@@ -110,8 +113,9 @@ def __init__(
             dtype: str = None,
             engine: str = None,
             sheet_name: str = None,
-            geodriver: str = None
-    ):
+            geodriver: str = None,
+            delete_zip_after: bool = False
+        ):
         self.compressed_ext = ['.zip', '.7z', '.tar', '.gz', '.tgz']
         self.depth = depth
         self.down_ext = down_ext if down_ext is not None else []
@@ -135,14 +139,16 @@ def __init__(
             '.json': self._read_json,
             '.geojson': self._read_geospatial,
             '.shp': self._read_geospatial,
-            '.kml': self._read_geospatial
+            '.kml': self._read_geospatial,
+            '.sav': self._read_sav
         }
         os.makedirs(self.output_path, exist_ok=True)
         self.ddtype = ddtype
         self.dtype = dtype
         self.engine = engine
         self.sheet_name = sheet_name
         self.geodriver = geodriver
+        self.delete_zip_after = delete_zip_after
         if not input_path:
             raise ValueError("input_path must be provided")
         if is_fwf and (not colnames or not colspecs):
@@ -297,6 +303,7 @@ def _extract_online_mode(self):
     def _process_downloaded_files(self, downloaded_files):
         """Process downloaded files using local mode logic"""
         files_to_process = []
+        zip_to_delete = []
 
         # Classify and extract compressed files
         for filepath in downloaded_files:
@@ -311,12 +318,23 @@ def _process_downloaded_files(self, downloaded_files):
                     down_ext=self.down_ext
                 )
                 files_to_process.extend(extracted)
+                if self.delete_zip_after:
+                    zip_to_delete.append(filepath)
             else:
                 files_to_process.append(filepath)
 
         # Process all files (both direct downloads and extracted files)
         self._process_files_locally(files_to_process)
 
+        # Delete zip files if requested
+        if self.delete_zip_after:
+            for zip_path in zip_to_delete:
+                try:
+                    os.remove(zip_path)
+                    logging.info(f"Deleted zip file after extraction: {zip_path}")
+                except Exception as e:
+                    logging.warning(f"Could not delete zip file {zip_path}: {e}")
+
     def _process_files_locally(self, files):
         """Shared local processing logic used by both modes"""
         valid_files = 0
@@ -344,6 +362,7 @@ def _extract_local_mode(self):
         iter_ext = list(compressed_inter) + list(set(self.down_ext) - compressed_inter)
 
         extracted_files = []
+        zip_to_delete = []
 
         for ext in iter_ext:
             full_pattern = os.path.join(self.input_path, f"*{ext}")
@@ -361,11 +380,22 @@ def _extract_local_mode(self):
                             down_ext=self.down_ext
                         )
                     )
+                    if self.delete_zip_after:
+                        zip_to_delete.append(filepath)
             else:
                 files_list.extend(glob.glob(full_pattern))
         # Process all files using the shared method
         self._process_files_locally(files_list + extracted_files)
 
+        # Delete zip files if requested
+        if self.delete_zip_after:
+            for zip_path in zip_to_delete:
+                try:
+                    os.remove(zip_path)
+                    logging.info(f"Deleted zip file after extraction: {zip_path}")
+                except Exception as e:
+                    logging.warning(f"Could not delete zip file {zip_path}: {e}")
+
         if not self.dataframes:
             logging.warning("No files found matching the specified extensions.")
 
@@ -410,6 +440,10 @@ def _read_geospatial(self, filepath):
     def _read_txt(self, filepath):
         return dd.read_csv(filepath, sep=self.sep or '\t', encoding=self.encoding, dtype=self.dtype or 'object')
 
+    def _read_sav(self, filepath):
+        df, meta = pyreadstat.read_sav(filepath)
+        return df
+    
     def _read_file(self, filepath):
         try:
             df = []
diff --git a/tests/mytest.py b/tests/mytest.py
@@ -52,7 +52,8 @@
     down_ext=['.csv', '.zip'],
     sep=';',
     output_path="data",
-    depth=0
+    depth=0,
+    delete_zip_after=True
 )
 
 per_online_extractor = Extractor(

Original file line number	Diff line number	Diff line change
`@@ -52,7 +52,8 @@`
`52`	`52`	`down_ext=['.csv', '.zip'],`
`53`	`53`	`sep=';',`
`54`	`54`	`output_path="data",`
`55`		`- depth=0`
	`55`	`+ depth=0,`
	`56`	`+ delete_zip_after=True`
`56`	`57`	`)`
`57`	`58`
`58`	`59`	`per_online_extractor = Extractor(`