Commit 14f61cd

feat: unzip and store latest dataset (#1284)
1 parent d707c0f

File tree: 8 files changed, +144 −22 lines

api/src/feeds/impl/models/gtfs_dataset_impl.py

Lines changed: 6 additions & 0 deletions
@@ -52,4 +52,10 @@ def from_orm(cls, gtfs_dataset: Gtfsdataset | None) -> GtfsDataset | None:
             service_date_range_start=gtfs_dataset.service_date_range_start,
             service_date_range_end=gtfs_dataset.service_date_range_end,
             agency_timezone=gtfs_dataset.agency_timezone,
+            unzipped_folder_size_mb=round(gtfs_dataset.unzipped_size_bytes / 1024**2, 2)
+            if gtfs_dataset.unzipped_size_bytes
+            else None,
+            zipped_folder_size_mb=round(gtfs_dataset.zipped_size_bytes / 1024**2, 2)
+            if gtfs_dataset.zipped_size_bytes
+            else None,
         )
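
The new keyword arguments convert the stored byte counts to megabytes and fall back to None when no size has been recorded. A minimal standalone sketch of the same conversion pattern (the helper name and sample values are illustrative, not from the repository):

from typing import Optional

def bytes_to_mb(size_bytes: Optional[int]) -> Optional[float]:
    # Same conditional as in from_orm: falsy input (None or 0) yields None.
    return round(size_bytes / 1024**2, 2) if size_bytes else None

assert bytes_to_mb(None) is None
assert bytes_to_mb(105_070_592) == 100.2  # roughly 100.2 MB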

api/src/feeds/impl/models/latest_dataset_impl.py

Lines changed: 6 additions & 0 deletions
@@ -49,4 +49,10 @@ def from_orm(cls, dataset: Gtfsdataset | None) -> LatestDataset | None:
             agency_timezone=dataset.agency_timezone,
             hash=dataset.hash,
             validation_report=validation_report,
+            unzipped_folder_size_mb=round(dataset.unzipped_size_bytes / 1024**2, 2)
+            if dataset.unzipped_size_bytes
+            else None,
+            zipped_folder_size_mb=round(dataset.zipped_size_bytes / 1024**2, 2)
+            if dataset.zipped_size_bytes
+            else None,
         )

docs/DatabaseCatalogAPI.yaml

Lines changed: 16 additions & 0 deletions
@@ -810,6 +810,14 @@ components:
           description: The timezone of the agency.
           type: string
           example: America/Los_Angeles
+        zipped_folder_size_mb:
+          description: The size of the zipped folder in MB.
+          type: number
+          example: 100.2
+        unzipped_folder_size_mb:
+          description: The size of the unzipped folder in MB.
+          type: number
+          example: 200.5
         validation_report:
           type: object
           properties:
@@ -1020,6 +1028,14 @@ components:
           description: The timezone of the agency.
           type: string
           example: America/Los_Angeles
+        zipped_folder_size_mb:
+          description: The size of the zipped folder in MB.
+          type: number
+          example: 100.2
+        unzipped_folder_size_mb:
+          description: The size of the unzipped folder in MB.
+          type: number
+          example: 200.5
 
     BoundingBox:
       description: Bounding box of the dataset when it was first added to the catalog.
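
Both dataset schemas now expose the two optional size fields. A purely illustrative payload excerpt showing how they sit alongside an existing field, using the example values declared above:

# Illustrative dataset payload excerpt; values mirror the schema examples above.
latest_dataset = {
    "agency_timezone": "America/Los_Angeles",
    "zipped_folder_size_mb": 100.2,
    "unzipped_folder_size_mb": 200.5,
}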

functions-python/batch_process_dataset/src/main.py

Lines changed: 67 additions & 14 deletions
@@ -22,14 +22,14 @@
 import zipfile
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Optional
+from typing import Optional, List
 
 import functions_framework
 from cloudevents.http import CloudEvent
 from google.cloud import storage
 from sqlalchemy import func
 
-from shared.database_gen.sqlacodegen_models import Gtfsdataset, t_feedsearch
+from shared.database_gen.sqlacodegen_models import Gtfsdataset, t_feedsearch, Gtfsfile
 from shared.dataset_service.main import DatasetTraceService, DatasetTrace, Status
 from shared.database.database import with_db_session, refresh_materialized_view
 import logging
@@ -48,8 +48,10 @@ class DatasetFile:
     """
 
     stable_id: str
+    extracted_files: List[Gtfsfile] = None
     file_sha256_hash: Optional[str] = None
     hosted_url: Optional[str] = None
+    zipped_size: Optional[int] = None
 
 
 class DatasetProcessor:
@@ -126,18 +128,60 @@ def download_content(self, temporary_file_path):
             logger=self.logger,
         )
         is_zip = zipfile.is_zipfile(temporary_file_path)
+        if is_zip:
+            extracted_file_path = os.path.join(
+                temporary_file_path.split(".")[0], "extracted"
+            )
+            with zipfile.ZipFile(temporary_file_path, "r") as zip_ref:
+                zip_ref.extractall(os.path.dirname(extracted_file_path))
+                # List all files in the extracted directory
+                extracted_files = os.listdir(os.path.dirname(extracted_file_path))
+                self.logger.info(f"Extracted files: {extracted_files}")
         return file_hash, is_zip
 
-    def upload_file_to_storage(self, source_file_path, target_path):
+    def upload_file_to_storage(self, source_file_path, dataset_stable_id):
         """
         Uploads a file to the GCP bucket
         """
         bucket = storage.Client().get_bucket(self.bucket_name)
-        blob = bucket.blob(target_path)
-        with open(source_file_path, "rb") as file:
-            blob.upload_from_file(file)
-        blob.make_public()
-        return blob
+        target_paths = [
+            f"{self.feed_stable_id}/latest.zip",
+            f"{self.feed_stable_id}/{dataset_stable_id}/{dataset_stable_id}.zip",
+        ]
+        blob = None
+        for target_path in target_paths:
+            blob = bucket.blob(target_path)
+            with open(source_file_path, "rb") as file:
+                blob.upload_from_file(file)
+            blob.make_public()
+
+        base_path, _ = os.path.splitext(source_file_path)
+        extracted_files_path = os.path.join(base_path, "extracted")
+        extracted_files: List[Gtfsfile] = []
+        if not os.path.exists(extracted_files_path):
+            self.logger.warning(
+                f"Extracted files path {extracted_files_path} does not exist."
+            )
+            return blob, extracted_files
+        for file_name in os.listdir(extracted_files_path):
+            file_path = os.path.join(extracted_files_path, file_name)
+            if os.path.isfile(file_path):
+                file_blob = bucket.blob(
+                    f"{self.feed_stable_id}/{dataset_stable_id}/extracted/{file_name}"
+                )
+                file_blob.upload_from_filename(file_path)
+                file_blob.make_public()
+                self.logger.info(
+                    f"Uploaded extracted file {file_name} to {file_blob.public_url}"
+                )
+                extracted_files.append(
+                    Gtfsfile(
+                        id=str(uuid.uuid4()),
+                        file_name=file_name,
+                        file_size_bytes=os.path.getsize(file_path),
+                    )
+                )
+        return blob, extracted_files
 
     def upload_dataset(self) -> DatasetFile or None:
         """
@@ -168,9 +212,6 @@ def upload_dataset(self) -> DatasetFile or None:
         self.logger.info(
             f"Creating file {self.feed_stable_id}/latest.zip in bucket {self.bucket_name}"
         )
-        self.upload_file_to_storage(
-            temp_file_path, f"{self.feed_stable_id}/latest.zip"
-        )
 
         dataset_stable_id = self.create_dataset_stable_id(
             self.feed_stable_id, self.date
@@ -182,15 +223,18 @@ def upload_dataset(self) -> DatasetFile or None:
             f"Creating file: {dataset_full_path}"
             f" in bucket {self.bucket_name}"
         )
-        self.upload_file_to_storage(
-            temp_file_path,
-            f"{dataset_full_path}",
+        _, extracted_files = self.upload_file_to_storage(
+            temp_file_path, dataset_stable_id
         )
 
        return DatasetFile(
            stable_id=dataset_stable_id,
            file_sha256_hash=file_sha256_hash,
            hosted_url=f"{self.public_hosted_datasets_url}/{dataset_full_path}",
+            extracted_files=extracted_files,
+            zipped_size=os.path.getsize(temp_file_path)
+            if os.path.exists(temp_file_path)
+            else None,
        )
 
        self.logger.info(
@@ -241,6 +285,15 @@ def create_dataset(self, dataset_file: DatasetFile, db_session: Session):
             hash=dataset_file.file_sha256_hash,
             downloaded_at=func.now(),
             hosted_url=dataset_file.hosted_url,
+            gtfsfiles=dataset_file.extracted_files
+            if dataset_file.extracted_files
+            else [],
+            zipped_size_bytes=dataset_file.zipped_size,
+            unzipped_size_bytes=sum(
+                [ex.file_size_bytes for ex in dataset_file.extracted_files]
+            )
+            if dataset_file.extracted_files
+            else None,
         )
         if latest_dataset:
             latest_dataset.latest = False
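
In short, download_content now unpacks the zip next to the temporary file, upload_file_to_storage uploads the latest.zip and timestamped copies plus every extracted file, and create_dataset stores one Gtfsfile row per file so that unzipped_size_bytes is the sum of their sizes. A rough local-only sketch of that size bookkeeping, with no GCS upload and with hypothetical names (extract_and_measure is not part of this codebase):

import os
import zipfile
from typing import Dict, Tuple

def extract_and_measure(zip_path: str) -> Tuple[int, Dict[str, int]]:
    """Extract zip_path into a sibling 'extracted' directory and return
    (zipped_size_bytes, {file_name: file_size_bytes})."""
    zipped_size_bytes = os.path.getsize(zip_path)
    extract_dir = os.path.join(os.path.splitext(zip_path)[0], "extracted")
    os.makedirs(extract_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_dir)
    sizes = {
        name: os.path.getsize(os.path.join(extract_dir, name))
        for name in os.listdir(extract_dir)
        if os.path.isfile(os.path.join(extract_dir, name))
    }
    return zipped_size_bytes, sizes

# Usage: the unzipped size is the sum of the per-file sizes,
# mirroring the create_dataset() change above.
# zipped, per_file = extract_and_measure("/tmp/feed.zip")
# unzipped = sum(per_file.values())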

functions-python/batch_process_dataset/tests/test_batch_process_dataset_main.py

Lines changed: 11 additions & 8 deletions
@@ -6,6 +6,9 @@
 from hashlib import sha256
 from typing import Final
 from unittest.mock import patch, MagicMock, Mock, mock_open
+
+import faker
+
 from main import (
     DatasetProcessor,
     DatasetFile,
@@ -52,7 +55,7 @@ def test_upload_dataset_diff_hash(
         mock_blob = MagicMock()
         mock_blob.public_url = public_url
         mock_blob.path = public_url
-        upload_file_to_storage.return_value = mock_blob
+        upload_file_to_storage.return_value = mock_blob, []
         mock_download_url_content.return_value = file_hash, True
 
         processor = DatasetProcessor(
@@ -78,8 +81,7 @@ def test_upload_dataset_diff_hash(
             f"/feed_stable_id-mocked_timestamp.zip",
         )
         self.assertEqual(result.file_sha256_hash, file_hash)
-        # Upload to storage is called twice, one for the latest and one for the timestamped one
-        self.assertEqual(upload_file_to_storage.call_count, 2)
+        self.assertEqual(upload_file_to_storage.call_count, 1)
 
 
    @patch("main.DatasetProcessor.upload_file_to_storage")
    @patch("main.DatasetProcessor.download_content")
@@ -176,7 +178,6 @@ def test_upload_dataset_download_exception(
     def test_upload_file_to_storage(self):
         bucket_name = "test-bucket"
         source_file_path = "path/to/source/file"
-        target_path = "path/to/target/file"
 
         mock_blob = Mock()
         mock_blob.public_url = public_url
@@ -202,15 +203,17 @@ def test_upload_file_to_storage(self):
             None,
             test_hosted_public_url,
         )
-        result = processor.upload_file_to_storage(source_file_path, target_path)
-
+        dataset_id = faker.Faker().uuid4()
+        result, _ = processor.upload_file_to_storage(source_file_path, dataset_id)
         self.assertEqual(result.public_url, public_url)
         mock_client.get_bucket.assert_called_with(bucket_name)
-        mock_bucket.blob.assert_called_with(target_path)
+        mock_bucket.blob.assert_called_with(
+            f"feed_stable_id/{dataset_id}/{dataset_id}.zip"
+        )
         mock_blob.upload_from_file.assert_called()
 
         # Assert that the file was opened in binary read mode
-        mock_file.assert_called_once_with(source_file_path, "rb")
+        mock_file.assert_called_with(source_file_path, "rb")
 
     @patch.dict(
         os.environ, {"FEEDS_CREDENTIALS": '{"test_stable_id": "test_credentials"}'}

liquibase/changelog.xml

Lines changed: 1 addition & 0 deletions
@@ -64,4 +64,5 @@
     <include file="changes/feat_1200.sql" relativeToChangelogFile="true"/>
     <include file="changes/feat_1195.sql" relativeToChangelogFile="true"/>
     <include file="changes/feat_1265_cascade_delete.sql" relativeToChangelogFile="true"/>
+    <include file="changes/feat_1259.sql" relativeToChangelogFile="true"/>
 </databaseChangeLog>

liquibase/changes/feat_1259.sql

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+DROP TABLE IF EXISTS GtfsFile;
+CREATE TABLE GtfsFile
+(
+    id VARCHAR(255) PRIMARY KEY,
+    gtfs_dataset_id VARCHAR(255) NOT NULL REFERENCES GtfsDataset(id) ON DELETE CASCADE,
+    file_name VARCHAR(255) NOT NULL,
+    file_size_bytes BIGINT NOT NULL
+);
+
+ALTER TABLE GtfsDataset DROP COLUMN IF EXISTS zipped_size;
+ALTER TABLE GtfsDataset DROP COLUMN IF EXISTS unzipped_size;
+ALTER TABLE GtfsDataset DROP COLUMN IF EXISTS zipped_size_bytes;
+ALTER TABLE GtfsDataset DROP COLUMN IF EXISTS unzipped_size_bytes;
+ALTER TABLE GtfsDataset
+    ADD COLUMN zipped_size_bytes BIGINT,
+    ADD COLUMN unzipped_size_bytes BIGINT;
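
The migration adds a GtfsFile table keyed to GtfsDataset (with cascade delete) and the two size columns on GtfsDataset. A rough sketch of the SQLAlchemy model the regenerated shared.database_gen.sqlacodegen_models.Gtfsfile could correspond to; the real definitions are produced by sqlacodegen from this schema and may differ in naming and relationship details:

from sqlalchemy import BigInteger, Column, ForeignKey, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Gtfsfile(Base):
    """Illustrative mapping for the GtfsFile table added by feat_1259.sql."""

    __tablename__ = "gtfsfile"

    id = Column(String(255), primary_key=True)
    gtfs_dataset_id = Column(
        String(255), ForeignKey("gtfsdataset.id", ondelete="CASCADE"), nullable=False
    )
    file_name = Column(String(255), nullable=False)
    file_size_bytes = Column(BigInteger, nullable=False)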

web-app/src/app/services/feeds/types.ts

Lines changed: 20 additions & 0 deletions
@@ -427,6 +427,16 @@ export interface components {
       * @example America/Los_Angeles
       */
      agency_timezone?: string;
+      /**
+       * @description The size of the zipped folder in MB.
+       * @example 100.2
+       */
+      zipped_folder_size_mb?: number;
+      /**
+       * @description The size of the unzipped folder in MB.
+       * @example 200.5
+       */
+      unzipped_folder_size_mb?: number;
      validation_report?: {
        /**
         * @description List of GTFS features associated to the dataset. More information, https://gtfs.org/getting-started/features/overview
@@ -578,6 +588,16 @@ export interface components {
       * @example America/Los_Angeles
       */
      agency_timezone?: string;
+      /**
+       * @description The size of the zipped folder in MB.
+       * @example 100.2
+       */
+      zipped_folder_size_mb?: number;
+      /**
+       * @description The size of the unzipped folder in MB.
+       * @example 200.5
+       */
+      unzipped_folder_size_mb?: number;
    };
    /** @description Bounding box of the dataset when it was first added to the catalog. */
    BoundingBox: {
