Skip to content

Commit f1f331f

Browse files
committed
Updated the dataset-processing function to compute file hashes via a shared helper
1 parent b56a193 commit f1f331f

File tree

4 files changed

+25
-15
lines changed

4 files changed

+25
-15
lines changed

functions-python/batch_process_dataset/src/main.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import base64
1818
import json
19+
import logging
1920
import os
2021
import random
2122
import uuid
@@ -28,15 +29,13 @@
2829
from cloudevents.http import CloudEvent
2930
from google.cloud import storage
3031
from sqlalchemy import func
32+
from sqlalchemy.orm import Session
3133

34+
from shared.database.database import with_db_session, refresh_materialized_view
3235
from shared.database_gen.sqlacodegen_models import Gtfsdataset, t_feedsearch, Gtfsfile
3336
from shared.dataset_service.main import DatasetTraceService, DatasetTrace, Status
34-
from shared.database.database import with_db_session, refresh_materialized_view
35-
import logging
36-
3737
from shared.helpers.logger import init_logger, get_logger
38-
from shared.helpers.utils import download_and_get_hash
39-
from sqlalchemy.orm import Session
38+
from shared.helpers.utils import download_and_get_hash, get_hash_from_file
4039

4140
init_logger()
4241

@@ -177,6 +176,8 @@ def upload_file_to_storage(
177176
id=str(uuid.uuid4()),
178177
file_name=file_name,
179178
file_size_bytes=os.path.getsize(file_path),
179+
hosted_url=file_blob.public_url if public else None,
180+
hash=get_hash_from_file(file_path),
180181
)
181182
)
182183
return blob, extracted_files

functions-python/helpers/utils.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,17 @@ def download_url_content(url, with_retry=False):
7878
raise e
7979

8080

81+
def get_hash_from_file(file_path, hash_algorithm="sha256", chunk_size=8192):
    """
    Return the hex digest of the file at ``file_path``.

    The file is read in ``chunk_size``-byte blocks so arbitrarily large
    files can be hashed without loading them fully into memory.

    :param file_path: path of the file to hash
    :param hash_algorithm: any algorithm name accepted by ``hashlib.new``
        (defaults to ``"sha256"``)
    :param chunk_size: number of bytes read per iteration
    :return: hexadecimal digest string
    """
    digest = hashlib.new(hash_algorithm)
    with open(file_path, "rb") as handle:
        # read() returns b"" at EOF, which is falsy and ends the loop
        while block := handle.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()
90+
91+
8192
def download_and_get_hash(
8293
url,
8394
file_path,

functions-python/tasks_executor/src/tasks/dataset_files/rebuild_missing_dataset_files.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
11
import logging
22
import os
3-
import uuid
4-
import hashlib
53
import shutil
4+
import ssl
65
import tempfile
7-
import zipfile
86
import urllib.request
9-
import ssl
7+
import uuid
8+
import zipfile
109

1110
from google.cloud import storage
11+
1212
from shared.database.database import with_db_session
1313
from shared.database_gen.sqlacodegen_models import Gtfsdataset, Gtfsfile
14+
from shared.helpers.utils import get_hash_from_file
1415

1516
# Disable SSL verification — trusted internal sources only
1617
ssl._create_default_https_context = ssl._create_unverified_context
@@ -115,15 +116,12 @@ def process_dataset(dataset: Gtfsdataset):
115116
blob.upload_from_filename(file_path)
116117
blob.make_public()
117118

118-
with open(file_path, "rb") as f:
119-
sha256_hash = hashlib.sha256(f.read()).hexdigest()
120-
121119
gtfs_files.append(
122120
Gtfsfile(
123121
id=str(uuid.uuid4()),
124122
file_name=file_name,
125123
file_size_bytes=os.path.getsize(file_path),
126-
hash=sha256_hash,
124+
hash=get_hash_from_file(file_path),
127125
hosted_url=blob.public_url if dataset.latest else None,
128126
)
129127
)

functions-python/tasks_executor/tests/tasks/dataset_files/test_rebuild_missing_dataset_files.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def test_process_dataset_latest(
115115
urlopen_mock.return_value.__enter__.return_value = mock_response
116116

117117
mock_file = MagicMock()
118-
mock_file.read.return_value = b"file content"
118+
mock_file.read.side_effect = [b"chunk1", b"chunk2", b""] # ends properly
119119
open_mock.return_value.__enter__.return_value = mock_file
120120

121121
mock_blob = MagicMock()
@@ -159,7 +159,7 @@ def test_process_dataset_not_latest(
159159
urlopen_mock.return_value.__enter__.return_value = mock_response
160160

161161
mock_file = MagicMock()
162-
mock_file.read.return_value = b"file content"
162+
mock_file.read.side_effect = [b"chunk1", b"chunk2", b""] # ends properly
163163
open_mock.return_value.__enter__.return_value = mock_file
164164

165165
mock_blob = MagicMock()

0 commit comments

Comments
 (0)