Skip to content

Commit e89d5d5

Browse files
committed
Add Deflate64 extraction support and bump version
Enable extraction of Deflate64-compressed ZIPs by adding zipfile-deflate64 and pyzipper dependencies and falling back to pyzipper when zipfile raises NotImplementedError. Detect Deflate64 entries and log a warning during extraction. Bump package version to 1.0.3 and add changelog entry. Also pass encoding to pyreadstat.read_sav, update test input to use .sav and adjust test flow (comment out vertical merge and disable download folder deletion).
1 parent 595fd3e commit e89d5d5

File tree

6 files changed

+33
-12
lines changed

6 files changed

+33
-12
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ The format is based on "Keep a Changelog" (https://keepachangelog.com/en/1.0.0/)
77
## [Unreleased]
88
- Prepare improvements and documentation updates.
99

10+
## [1.0.3] - 2026-02-23
11+
### Fixed
12+
- Extractor now decompresses Deflate64-compressed ZIP archives.
1013

1114
## [1.0.2] - 2026-02-23
1215
### Added

requirements.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,6 @@ deep_translator
1414
transformers
1515
torch
1616
pytest
17-
geopandas
17+
geopandas
18+
pyzipper
19+
zipfile-deflate64

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
# For a discussion on single-sourcing the version across setup.py and the
4040
# project code, see
4141
# https://packaging.python.org/en/latest/single_source_version.html
42-
version='1.0.2', # Required
42+
version='1.0.3', # Required
4343

4444
# This is a one-line description or tagline of what your project does. This
4545
# corresponds to the "Summary" metadata field:

src/socio4health/extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -441,7 +441,7 @@ def _read_txt(self, filepath):
441441
return dd.read_csv(filepath, sep=self.sep or '\t', encoding=self.encoding, dtype=self.dtype or 'object')
442442

443443
def _read_sav(self, filepath):
444-
df, meta = pyreadstat.read_sav(filepath)
444+
df, meta = pyreadstat.read_sav(filepath, encoding=self.encoding)
445445
return df
446446

447447
def _read_file(self, filepath):

src/socio4health/utils/extractor_utils.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import zipfile_deflate64
2+
import pyzipper
13
from scrapy.crawler import CrawlerProcess
24
from .standard_spider import StandardSpider
35
import zipfile
@@ -145,8 +147,21 @@ def compressed2files(input_archive, target_directory, down_ext, current_depth=0,
145147
try:
146148
# Extract the archive
147149
if zipfile.is_zipfile(input_archive):
148-
with zipfile.ZipFile(input_archive, 'r') as zip_ref:
149-
zip_ref.extractall(temp_dir)
150+
try:
151+
with zipfile.ZipFile(input_archive, 'r') as zip_ref:
152+
for zinfo in zip_ref.infolist():
153+
if getattr(zinfo, 'compress_type', None) == 9:
154+
logging.warning(f"Extracting Deflate64-compressed zip file: {input_archive}. This may take a while...")
155+
break
156+
zip_ref.extractall(temp_dir)
157+
except NotImplementedError as e:
158+
logging.warning(f"zipfile failed for {input_archive}: {e}. Trying pyzipper fallback.")
159+
try:
160+
with pyzipper.ZipFile(input_archive, 'r') as zip_ref:
161+
zip_ref.extractall(temp_dir)
162+
except Exception as e2:
163+
logging.error(f"pyzipper extraction failed for {input_archive}: {e2}")
164+
return set()
150165
elif tarfile.is_tarfile(input_archive):
151166
with tarfile.open(input_archive, 'r:*') as tar_ref:
152167
tar_ref.extractall(temp_dir)

tests/mytest.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@
4848

4949
# Online extractors
5050
col_online_extractor = Extractor(
51-
input_path="https://microdatos.dane.gov.co/index.php/catalog/771/get-microdata",
52-
down_ext=['.csv', '.zip'],
51+
input_path="https://microdatos.dane.gov.co/index.php/catalog/827/get-microdata",
52+
down_ext=['.sav', '.zip'],
5353
sep=';',
5454
output_path="data",
5555
depth=0,
@@ -106,14 +106,15 @@ def test():
106106
print('Extracting data...')
107107
dfs = extractor.s4h_extract()
108108

109-
print('Vertical merge_____________________________________')
110-
dfs = har.s4h_vertical_merge(dfs)
111-
112109
for i, df in enumerate(dfs):
113110
print(f"DataFrame {i + 1} shape: {df.shape}")
114111
print(df.head())
115112
print("-" * 50)
116113

114+
"""
115+
print('Vertical merge_____________________________________')
116+
dfs = har.s4h_vertical_merge(dfs)
117+
117118
har.categories = ["Business"]
118119
har.key_col = 'DPTO'
119120
har.key_val = ['11']
@@ -123,7 +124,7 @@ def test():
123124
124125
print(filtered_dask_dfs[0].head())
125126
126-
"""
127+
127128
print('Horizontal merge___________________________________')
128129
joined_df = har.s4h_join_data(filtered_ddfs)
129130
available_cols = joined_df.columns.tolist()
@@ -136,7 +137,7 @@ def test():
136137
joined_df.to_csv('data/GEIH_2022_harmonized.csv', index=False)
137138
"""
138139

139-
extractor.s4h_delete_download_folder()
140+
#extractor.s4h_delete_download_folder()
140141

141142
if __name__ == "__main__":
142143
test()

0 commit comments

Comments
 (0)