Skip to content

Commit 595fd3e

Browse files
committed
Add .sav support and delete_zip_after option
Support reading SPSS .sav files and add an option to remove compressed files after extraction. Changes: bump package version to 1.0.2, update CHANGELOG, import pyreadstat and implement _read_sav, add delete_zip_after parameter (default False) to Extractor with docs and attribute, register '.sav' handler, and add logic to track and delete zip/compressed files after extraction (with logging on success/failure). Updated tests to enable delete_zip_after=True.
1 parent 3cee6c0 commit 595fd3e

File tree

4 files changed

+47
-6
lines changed

4 files changed

+47
-6
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@ The format is based on "Keep a Changelog" (https://keepachangelog.com/en/1.0.0/)
77
## [Unreleased]
88
- Prepare improvements and documentation updates.
99

10+
11+
## [1.0.2] - 2026-02-23
12+
### Added
13+
- Extractor can now read SPSS `.sav` files.
14+
- New `delete_zip_after` parameter on Extractor (default `False`) to delete compressed files after extraction.
15+
1016
## [1.0.1] - 2026-02-20
1117
### Fixed
1218
- Each extracted compressed file is now stored in its own independent folder.

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
# For a discussion on single-sourcing the version across setup.py and the
4040
# project code, see
4141
# https://packaging.python.org/en/latest/single_source_version.html
42-
version='1.0.1', # Required
42+
version='1.0.2', # Required
4343

4444
# This is a one-line description or tagline of what your project does. This
4545
# corresponds to the "Summary" metadata field:

src/socio4health/extractor.py

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import os
1616
import pandas as pd
1717
import geopandas as gpd
18+
import pyreadstat
1819
import dask.dataframe as dd
1920
from tqdm import tqdm
2021
import glob
@@ -68,7 +69,7 @@ class Extractor:
6869
colspecs : list
6970
Column specifications for fixed-width files, defining the widths of each column. Required if ``is_fwf`` is ``True``.
7071
sep : str
71-
The separator to use when reading ``CSV`` files. Defaults to ``','``.
72+
The separator to use when reading ``CSV`` files. Defaults to ``,``.
7273
ddtype : Union[str, Dict]
7374
The data type to use when reading files. Can be a single type or a dictionary mapping column names to types. Defaults to ``object``.
7475
dtype : Union[str, Dict]
@@ -79,6 +80,8 @@ class Extractor:
7980
The name or index of the Excel sheet to read. Can also be a list to read multiple sheets or ``None`` to read all sheets. Defaults to the first sheet (``0``).
8081
geodriver : str
8182
The driver to use for reading geospatial files with ``geopandas.read_file()`` (e.g., ``'ESRI Shapefile'``, ``'KML'``, etc.). Optional.
83+
delete_zip_after : bool
84+
If ``True``, delete zip/compressed files after they have been extracted and processed. Defaults to ``False``.
8285
8386
Important
8487
------
@@ -110,8 +113,9 @@ def __init__(
110113
dtype: str = None,
111114
engine: str = None,
112115
sheet_name: str = None,
113-
geodriver: str = None
114-
):
116+
geodriver: str = None,
117+
delete_zip_after: bool = False
118+
):
115119
self.compressed_ext = ['.zip', '.7z', '.tar', '.gz', '.tgz']
116120
self.depth = depth
117121
self.down_ext = down_ext if down_ext is not None else []
@@ -135,14 +139,16 @@ def __init__(
135139
'.json': self._read_json,
136140
'.geojson': self._read_geospatial,
137141
'.shp': self._read_geospatial,
138-
'.kml': self._read_geospatial
142+
'.kml': self._read_geospatial,
143+
'.sav': self._read_sav
139144
}
140145
os.makedirs(self.output_path, exist_ok=True)
141146
self.ddtype = ddtype
142147
self.dtype = dtype
143148
self.engine = engine
144149
self.sheet_name = sheet_name
145150
self.geodriver = geodriver
151+
self.delete_zip_after = delete_zip_after
146152
if not input_path:
147153
raise ValueError("input_path must be provided")
148154
if is_fwf and (not colnames or not colspecs):
@@ -297,6 +303,7 @@ def _extract_online_mode(self):
297303
def _process_downloaded_files(self, downloaded_files):
298304
"""Process downloaded files using local mode logic"""
299305
files_to_process = []
306+
zip_to_delete = []
300307

301308
# Classify and extract compressed files
302309
for filepath in downloaded_files:
@@ -311,12 +318,23 @@ def _process_downloaded_files(self, downloaded_files):
311318
down_ext=self.down_ext
312319
)
313320
files_to_process.extend(extracted)
321+
if self.delete_zip_after:
322+
zip_to_delete.append(filepath)
314323
else:
315324
files_to_process.append(filepath)
316325

317326
# Process all files (both direct downloads and extracted files)
318327
self._process_files_locally(files_to_process)
319328

329+
# Delete zip files if requested
330+
if self.delete_zip_after:
331+
for zip_path in zip_to_delete:
332+
try:
333+
os.remove(zip_path)
334+
logging.info(f"Deleted zip file after extraction: {zip_path}")
335+
except Exception as e:
336+
logging.warning(f"Could not delete zip file {zip_path}: {e}")
337+
320338
def _process_files_locally(self, files):
321339
"""Shared local processing logic used by both modes"""
322340
valid_files = 0
@@ -344,6 +362,7 @@ def _extract_local_mode(self):
344362
iter_ext = list(compressed_inter) + list(set(self.down_ext) - compressed_inter)
345363

346364
extracted_files = []
365+
zip_to_delete = []
347366

348367
for ext in iter_ext:
349368
full_pattern = os.path.join(self.input_path, f"*{ext}")
@@ -361,11 +380,22 @@ def _extract_local_mode(self):
361380
down_ext=self.down_ext
362381
)
363382
)
383+
if self.delete_zip_after:
384+
zip_to_delete.append(filepath)
364385
else:
365386
files_list.extend(glob.glob(full_pattern))
366387
# Process all files using the shared method
367388
self._process_files_locally(files_list + extracted_files)
368389

390+
# Delete zip files if requested
391+
if self.delete_zip_after:
392+
for zip_path in zip_to_delete:
393+
try:
394+
os.remove(zip_path)
395+
logging.info(f"Deleted zip file after extraction: {zip_path}")
396+
except Exception as e:
397+
logging.warning(f"Could not delete zip file {zip_path}: {e}")
398+
369399
if not self.dataframes:
370400
logging.warning("No files found matching the specified extensions.")
371401

@@ -410,6 +440,10 @@ def _read_geospatial(self, filepath):
410440
def _read_txt(self, filepath):
411441
return dd.read_csv(filepath, sep=self.sep or '\t', encoding=self.encoding, dtype=self.dtype or 'object')
412442

443+
def _read_sav(self, filepath):
444+
df, meta = pyreadstat.read_sav(filepath)
445+
return df
446+
413447
def _read_file(self, filepath):
414448
try:
415449
df = []

tests/mytest.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@
5252
down_ext=['.csv', '.zip'],
5353
sep=';',
5454
output_path="data",
55-
depth=0
55+
depth=0,
56+
delete_zip_after=True
5657
)
5758

5859
per_online_extractor = Extractor(

0 commit comments

Comments
 (0)