Commit 14f61cd

feat: unzip and store latest dataset (#1284)
1 parent d707c0f

File tree: 8 files changed, +144 −22 lines

api/src/feeds/impl/models/gtfs_dataset_impl.py

Lines changed: 6 additions & 0 deletions
@@ -52,4 +52,10 @@ def from_orm(cls, gtfs_dataset: Gtfsdataset | None) -> GtfsDataset | None:
             service_date_range_start=gtfs_dataset.service_date_range_start,
             service_date_range_end=gtfs_dataset.service_date_range_end,
             agency_timezone=gtfs_dataset.agency_timezone,
+            unzipped_folder_size_mb=round(gtfs_dataset.unzipped_size_bytes / 1024**2, 2)
+            if gtfs_dataset.unzipped_size_bytes
+            else None,
+            zipped_folder_size_mb=round(gtfs_dataset.zipped_size_bytes / 1024**2, 2)
+            if gtfs_dataset.zipped_size_bytes
+            else None,
         )
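
The new keyword arguments convert the stored byte counts to megabytes and fall back to None when no size has been recorded. A minimal standalone sketch of the same conversion pattern (the helper name and sample values are illustrative, not from the repository):

from typing import Optional

def bytes_to_mb(size_bytes: Optional[int]) -> Optional[float]:
    # Same conditional as in from_orm: falsy input (None or 0) yields None.
    return round(size_bytes / 1024**2, 2) if size_bytes else None

assert bytes_to_mb(None) is None
assert bytes_to_mb(105_070_592) == 100.2  # roughly 100.2 MB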

api/src/feeds/impl/models/latest_dataset_impl.py

Lines changed: 6 additions & 0 deletions
@@ -49,4 +49,10 @@ def from_orm(cls, dataset: Gtfsdataset | None) -> LatestDataset | None:
             agency_timezone=dataset.agency_timezone,
             hash=dataset.hash,
             validation_report=validation_report,
+            unzipped_folder_size_mb=round(dataset.unzipped_size_bytes / 1024**2, 2)
+            if dataset.unzipped_size_bytes
+            else None,
+            zipped_folder_size_mb=round(dataset.zipped_size_bytes / 1024**2, 2)
+            if dataset.zipped_size_bytes
+            else None,
         )

docs/DatabaseCatalogAPI.yaml

Lines changed: 16 additions & 0 deletions
@@ -810,6 +810,14 @@ components:
           description: The timezone of the agency.
           type: string
           example: America/Los_Angeles
+        zipped_folder_size_mb:
+          description: The size of the zipped folder in MB.
+          type: number
+          example: 100.2
+        unzipped_folder_size_mb:
+          description: The size of the unzipped folder in MB.
+          type: number
+          example: 200.5
         validation_report:
           type: object
           properties:
@@ -1020,6 +1028,14 @@ components:
           description: The timezone of the agency.
           type: string
           example: America/Los_Angeles
+        zipped_folder_size_mb:
+          description: The size of the zipped folder in MB.
+          type: number
+          example: 100.2
+        unzipped_folder_size_mb:
+          description: The size of the unzipped folder in MB.
+          type: number
+          example: 200.5
 
     BoundingBox:
       description: Bounding box of the dataset when it was first added to the catalog.
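
Both dataset schemas now expose the two optional size fields. A purely illustrative payload excerpt showing how they sit alongside an existing field, using the example values declared above:

# Illustrative dataset payload excerpt; values mirror the schema examples above.
latest_dataset = {
    "agency_timezone": "America/Los_Angeles",
    "zipped_folder_size_mb": 100.2,
    "unzipped_folder_size_mb": 200.5,
}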

functions-python/batch_process_dataset/src/main.py

Lines changed: 67 additions & 14 deletions
@@ -22,14 +22,14 @@
 import zipfile
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Optional
+from typing import Optional, List
 
 import functions_framework
 from cloudevents.http import CloudEvent
 from google.cloud import storage
 from sqlalchemy import func
 
-from shared.database_gen.sqlacodegen_models import Gtfsdataset, t_feedsearch
+from shared.database_gen.sqlacodegen_models import Gtfsdataset, t_feedsearch, Gtfsfile
 from shared.dataset_service.main import DatasetTraceService, DatasetTrace, Status
 from shared.database.database import with_db_session, refresh_materialized_view
 import logging
@@ -48,8 +48,10 @@ class DatasetFile:
     """
 
     stable_id: str
+    extracted_files: List[Gtfsfile] = None
     file_sha256_hash: Optional[str] = None
     hosted_url: Optional[str] = None
+    zipped_size: Optional[int] = None
 
 
 class DatasetProcessor:
@@ -126,18 +128,60 @@ def download_content(self, temporary_file_path):
             logger=self.logger,
         )
         is_zip = zipfile.is_zipfile(temporary_file_path)
+        if is_zip:
+            extracted_file_path = os.path.join(
+                temporary_file_path.split(".")[0], "extracted"
+            )
+            with zipfile.ZipFile(temporary_file_path, "r") as zip_ref:
+                zip_ref.extractall(os.path.dirname(extracted_file_path))
+                # List all files in the extracted directory
+                extracted_files = os.listdir(os.path.dirname(extracted_file_path))
+                self.logger.info(f"Extracted files: {extracted_files}")
         return file_hash, is_zip
 
-    def upload_file_to_storage(self, source_file_path, target_path):
+    def upload_file_to_storage(self, source_file_path, dataset_stable_id):
         """
         Uploads a file to the GCP bucket
         """
         bucket = storage.Client().get_bucket(self.bucket_name)
-        blob = bucket.blob(target_path)
-        with open(source_file_path, "rb") as file:
-            blob.upload_from_file(file)
-        blob.make_public()
-        return blob
+        target_paths = [
+            f"{self.feed_stable_id}/latest.zip",
+            f"{self.feed_stable_id}/{dataset_stable_id}/{dataset_stable_id}.zip",
+        ]
+        blob = None
+        for target_path in target_paths:
+            blob = bucket.blob(target_path)
+            with open(source_file_path, "rb") as file:
+                blob.upload_from_file(file)
+            blob.make_public()
+
+        base_path, _ = os.path.splitext(source_file_path)
+        extracted_files_path = os.path.join(base_path, "extracted")
+        extracted_files: List[Gtfsfile] = []
+        if not os.path.exists(extracted_files_path):
+            self.logger.warning(
+                f"Extracted files path {extracted_files_path} does not exist."
+            )
+            return blob, extracted_files
+        for file_name in os.listdir(extracted_files_path):
+            file_path = os.path.join(extracted_files_path, file_name)
+            if os.path.isfile(file_path):
+                file_blob = bucket.blob(
+                    f"{self.feed_stable_id}/{dataset_stable_id}/extracted/{file_name}"
+                )
+                file_blob.upload_from_filename(file_path)
+                file_blob.make_public()
+                self.logger.info(
+                    f"Uploaded extracted file {file_name} to {file_blob.public_url}"
+                )
+                extracted_files.append(
+                    Gtfsfile(
+                        id=str(uuid.uuid4()),
+                        file_name=file_name,
+                        file_size_bytes=os.path.getsize(file_path),
+                    )
+                )
+        return blob, extracted_files
 
     def upload_dataset(self) -> DatasetFile or None:
         """
@@ -168,9 +212,6 @@ def upload_dataset(self) -> DatasetFile or None:
         self.logger.info(
             f"Creating file {self.feed_stable_id}/latest.zip in bucket {self.bucket_name}"
         )
-        self.upload_file_to_storage(
-            temp_file_path, f"{self.feed_stable_id}/latest.zip"
-        )
 
         dataset_stable_id = self.create_dataset_stable_id(
             self.feed_stable_id, self.date
@@ -182,15 +223,18 @@ def upload_dataset(self) -> DatasetFile or None:
             f"Creating file: {dataset_full_path}"
             f" in bucket {self.bucket_name}"
         )
-        self.upload_file_to_storage(
-            temp_file_path,
-            f"{dataset_full_path}",
+        _, extracted_files = self.upload_file_to_storage(
+            temp_file_path, dataset_stable_id
         )
 
        return DatasetFile(
            stable_id=dataset_stable_id,
            file_sha256_hash=file_sha256_hash,
            hosted_url=f"{self.public_hosted_datasets_url}/{dataset_full_path}",
+            extracted_files=extracted_files,
+            zipped_size=os.path.getsize(temp_file_path)
+            if os.path.exists(temp_file_path)
+            else None,
        )
 
        self.logger.info(
@@ -241,6 +285,15 @@ def create_dataset(self, dataset_file: DatasetFile, db_session: Session):
             hash=dataset_file.file_sha256_hash,
             downloaded_at=func.now(),
             hosted_url=dataset_file.hosted_url,
+            gtfsfiles=dataset_file.extracted_files
+            if dataset_file.extracted_files
+            else [],
+            zipped_size_bytes=dataset_file.zipped_size,
+            unzipped_size_bytes=sum(
+                [ex.file_size_bytes for ex in dataset_file.extracted_files]
+            )
+            if dataset_file.extracted_files
+            else None,
         )
         if latest_dataset:
             latest_dataset.latest = False
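
In short, download_content now unpacks the zip next to the temporary file, upload_file_to_storage uploads the latest.zip and timestamped copies plus every extracted file, and create_dataset stores one Gtfsfile row per file so that unzipped_size_bytes is the sum of their sizes. A rough local-only sketch of that size bookkeeping, with no GCS upload and with hypothetical names (extract_and_measure is not part of this codebase):

import os
import zipfile
from typing import Dict, Tuple

def extract_and_measure(zip_path: str) -> Tuple[int, Dict[str, int]]:
    """Extract zip_path into a sibling 'extracted' directory and return
    (zipped_size_bytes, {file_name: file_size_bytes})."""
    zipped_size_bytes = os.path.getsize(zip_path)
    extract_dir = os.path.join(os.path.splitext(zip_path)[0], "extracted")
    os.makedirs(extract_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_dir)
    sizes = {
        name: os.path.getsize(os.path.join(extract_dir, name))
        for name in os.listdir(extract_dir)
        if os.path.isfile(os.path.join(extract_dir, name))
    }
    return zipped_size_bytes, sizes

# Usage: the unzipped size is the sum of the per-file sizes,
# mirroring the create_dataset() change above.
# zipped, per_file = extract_and_measure("/tmp/feed.zip")
# unzipped = sum(per_file.values())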

functions-python/batch_process_dataset/tests/test_batch_process_dataset_main.py

Lines changed: 11 additions & 8 deletions
@@ -6,6 +6,9 @@
 from hashlib import sha256
 from typing import Final
 from unittest.mock import patch, MagicMock, Mock, mock_open
+
+import faker
+
 from main import (
     DatasetProcessor,
     DatasetFile,
@@ -52,7 +55,7 @@ def test_upload_dataset_diff_hash(
         mock_blob = MagicMock()
         mock_blob.public_url = public_url
         mock_blob.path = public_url
-        upload_file_to_storage.return_value = mock_blob
+        upload_file_to_storage.return_value = mock_blob, []
         mock_download_url_content.return_value = file_hash, True
 
         processor = DatasetProcessor(
@@ -78,8 +81,7 @@ def test_upload_dataset_diff_hash(
             f"/feed_stable_id-mocked_timestamp.zip",
         )
         self.assertEqual(result.file_sha256_hash, file_hash)
-        # Upload to storage is called twice, one for the latest and one for the timestamped one
-        self.assertEqual(upload_file_to_storage.call_count, 2)
+        self.assertEqual(upload_file_to_storage.call_count, 1)
 
 
    @patch("main.DatasetProcessor.upload_file_to_storage")
    @patch("main.DatasetProcessor.download_content")
@@ -176,7 +178,6 @@ def test_upload_dataset_download_exception(
     def test_upload_file_to_storage(self):
         bucket_name = "test-bucket"
         source_file_path = "path/to/source/file"
-        target_path = "path/to/target/file"
 
         mock_blob = Mock()
         mock_blob.public_url = public_url
@@ -202,15 +203,17 @@ def test_upload_file_to_storage(self):
             None,
             test_hosted_public_url,
         )
-        result = processor.upload_file_to_storage(source_file_path, target_path)
-
+        dataset_id = faker.Faker().uuid4()
+        result, _ = processor.upload_file_to_storage(source_file_path, dataset_id)
         self.assertEqual(result.public_url, public_url)
         mock_client.get_bucket.assert_called_with(bucket_name)
-        mock_bucket.blob.assert_called_with(target_path)
+        mock_bucket.blob.assert_called_with(
+            f"feed_stable_id/{dataset_id}/{dataset_id}.zip"
+        )
         mock_blob.upload_from_file.assert_called()
 
         # Assert that the file was opened in binary read mode
-        mock_file.assert_called_once_with(source_file_path, "rb")
+        mock_file.assert_called_with(source_file_path, "rb")
 
     @patch.dict(
         os.environ, {"FEEDS_CREDENTIALS": '{"test_stable_id": "test_credentials"}'}

liquibase/changelog.xml

Lines changed: 1 addition & 0 deletions
@@ -64,4 +64,5 @@
     <include file="changes/feat_1200.sql" relativeToChangelogFile="true"/>
     <include file="changes/feat_1195.sql" relativeToChangelogFile="true"/>
     <include file="changes/feat_1265_cascade_delete.sql" relativeToChangelogFile="true"/>
+    <include file="changes/feat_1259.sql" relativeToChangelogFile="true"/>
 </databaseChangeLog>

liquibase/changes/feat_1259.sql

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+DROP TABLE IF EXISTS GtfsFile;
+CREATE TABLE GtfsFile
+(
+    id VARCHAR(255) PRIMARY KEY,
+    gtfs_dataset_id VARCHAR(255) NOT NULL REFERENCES GtfsDataset(id) ON DELETE CASCADE,
+    file_name VARCHAR(255) NOT NULL,
+    file_size_bytes BIGINT NOT NULL
+);
+
+ALTER TABLE GtfsDataset DROP COLUMN IF EXISTS zipped_size;
+ALTER TABLE GtfsDataset DROP COLUMN IF EXISTS unzipped_size;
+ALTER TABLE GtfsDataset DROP COLUMN IF EXISTS zipped_size_bytes;
+ALTER TABLE GtfsDataset DROP COLUMN IF EXISTS unzipped_size_bytes;
+ALTER TABLE GtfsDataset
+    ADD COLUMN zipped_size_bytes BIGINT,
+    ADD COLUMN unzipped_size_bytes BIGINT;
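
The migration adds a GtfsFile table keyed to GtfsDataset (with cascade delete) and the two size columns on GtfsDataset. A rough sketch of the SQLAlchemy model the regenerated shared.database_gen.sqlacodegen_models.Gtfsfile could correspond to; the real definitions are produced by sqlacodegen from this schema and may differ in naming and relationship details:

from sqlalchemy import BigInteger, Column, ForeignKey, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Gtfsfile(Base):
    """Illustrative mapping for the GtfsFile table added by feat_1259.sql."""

    __tablename__ = "gtfsfile"

    id = Column(String(255), primary_key=True)
    gtfs_dataset_id = Column(
        String(255), ForeignKey("gtfsdataset.id", ondelete="CASCADE"), nullable=False
    )
    file_name = Column(String(255), nullable=False)
    file_size_bytes = Column(BigInteger, nullable=False)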

web-app/src/app/services/feeds/types.ts

Lines changed: 20 additions & 0 deletions
@@ -427,6 +427,16 @@ export interface components {
       * @example America/Los_Angeles
       */
      agency_timezone?: string;
+      /**
+       * @description The size of the zipped folder in MB.
+       * @example 100.2
+       */
+      zipped_folder_size_mb?: number;
+      /**
+       * @description The size of the unzipped folder in MB.
+       * @example 200.5
+       */
+      unzipped_folder_size_mb?: number;
      validation_report?: {
        /**
         * @description List of GTFS features associated to the dataset. More information, https://gtfs.org/getting-started/features/overview
@@ -578,6 +588,16 @@ export interface components {
       * @example America/Los_Angeles
       */
      agency_timezone?: string;
+      /**
+       * @description The size of the zipped folder in MB.
+       * @example 100.2
+       */
+      zipped_folder_size_mb?: number;
+      /**
+       * @description The size of the unzipped folder in MB.
+       * @example 200.5
+       */
+      unzipped_folder_size_mb?: number;
    };
    /** @description Bounding box of the dataset when it was first added to the catalog. */
    BoundingBox: {
