Skip to content

Commit 1024c95

Browse files
committed
Unzip files after hash check.
1 parent b145811 commit 1024c95

File tree

2 files changed

+19
-15
lines changed

2 files changed

+19
-15
lines changed

functions-python/batch_process_dataset/src/main.py

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -128,19 +128,7 @@ def download_content(self, temporary_file_path):
128128
logger=self.logger,
129129
)
130130
is_zip = zipfile.is_zipfile(temporary_file_path)
131-
extracted_files_path = None
132-
if is_zip:
133-
extracted_files_path = os.path.join(
134-
temporary_file_path.split(".")[0], "extracted"
135-
)
136-
# Create the directory for extracted files if it does not exist
137-
os.makedirs(extracted_files_path, exist_ok=True)
138-
with zipfile.ZipFile(temporary_file_path, "r") as zip_ref:
139-
zip_ref.extractall(path=extracted_files_path)
140-
# List all files in the extracted directory
141-
extracted_files = os.listdir(extracted_files_path)
142-
self.logger.info(f"Extracted files: {extracted_files}")
143-
return file_hash, is_zip, extracted_files_path
131+
return file_hash, is_zip
144132

145133
def upload_file_to_storage(
146134
self, source_file_path, dataset_stable_id, extracted_files_path
@@ -209,12 +197,14 @@ def upload_dataset(self) -> DatasetFile or None:
209197
self.logger.info(
210198
f"[{self.feed_stable_id}] File hash is {file_sha256_hash}."
211199
)
212-
213200
if self.latest_hash != file_sha256_hash:
214201
self.logger.info(
215202
f"[{self.feed_stable_id}] Dataset has changed (hash {self.latest_hash}"
216203
f"-> {file_sha256_hash}). Uploading new version."
217204
)
205+
extracted_files_path = self.unzip_files(
206+
extracted_files_path, temp_file_path
207+
)
218208
self.logger.info(
219209
f"Creating file {self.feed_stable_id}/latest.zip in bucket {self.bucket_name}"
220210
)
@@ -252,6 +242,18 @@ def upload_dataset(self) -> DatasetFile or None:
252242
os.remove(temp_file_path)
253243
return None
254244

245+
def unzip_files(self, extracted_files_path, temp_file_path):
246+
extracted_files_path = os.path.join(temp_file_path.split(".")[0], "extracted")
247+
self.logger.info(f"Unzipping files to {extracted_files_path}")
248+
# Create the directory for extracted files if it does not exist
249+
os.makedirs(extracted_files_path, exist_ok=True)
250+
with zipfile.ZipFile(temp_file_path, "r") as zip_ref:
251+
zip_ref.extractall(path=extracted_files_path)
252+
# List all files in the extracted directory
253+
extracted_files = os.listdir(extracted_files_path)
254+
self.logger.info(f"Extracted files: {extracted_files}")
255+
return extracted_files_path
256+
255257
def generate_temp_filename(self):
256258
"""
257259
Generates a temporary filename

functions-python/batch_process_dataset/tests/test_batch_process_dataset_main.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,9 @@ def create_cloud_event(mock_data):
4646
class TestDatasetProcessor(unittest.TestCase):
4747
@patch("main.DatasetProcessor.upload_file_to_storage")
4848
@patch("main.DatasetProcessor.download_content")
49+
@patch("main.DatasetProcessor.unzip_files")
4950
def test_upload_dataset_diff_hash(
50-
self, mock_download_url_content, upload_file_to_storage
51+
self, mock_unzip_files, mock_download_url_content, upload_file_to_storage
5152
):
5253
"""
5354
Test upload_dataset method of DatasetProcessor class with different hash from the latest one
@@ -57,6 +58,7 @@ def test_upload_dataset_diff_hash(
5758
mock_blob.path = public_url
5859
upload_file_to_storage.return_value = mock_blob, []
5960
mock_download_url_content.return_value = file_hash, True, "path/file"
61+
mock_unzip_files.return_value = [mock_blob, mock_blob]
6062

6163
processor = DatasetProcessor(
6264
public_url,

0 commit comments

Comments (0)