
Commit a904893: "testing changes"
1 parent ff45cd3

7 files changed (+162 / -60 lines)

functions-python/batch_datasets/README.md

Lines changed: 8 additions & 0 deletions
@@ -1,5 +1,12 @@
 # Batch Datasets
 This directory contains the GCP serverless function that enqueues all active feeds to download datasets.
+The function accepts an optional request body to limit the feeds to process; otherwise it processes all active feeds:
+```json
+{
+  "feed_stable_ids": ["feed_id_1", "feed_id_2"]
+}
+```
+
 The function publishes one Pub/Sub message per active feed with the following format:
 ```json
 {
@@ -19,6 +26,7 @@ The function publishes one Pub/Sub message per active feed with the following format:
 }
 }
 ```
+# TODO - Update with current behavior

 # Function configuration
 The function is configured using the following environment variables:
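For reference, a minimal sketch of how the optional request body documented above could be sent to the deployed HTTP function. The URL, function name, and any required authentication are placeholders and depend on the deployment, not something this diff specifies.

```python
import requests

# Placeholder endpoint; real URL and auth depend on how the function is deployed.
FUNCTION_URL = "https://<region>-<project>.cloudfunctions.net/batch-datasets"

# Limit the run to two feeds; omit the body (or send {}) to process all active feeds.
response = requests.post(
    FUNCTION_URL,
    json={"feed_stable_ids": ["feed_id_1", "feed_id_2"]},
    timeout=60,
)
print(response.status_code, response.text)
```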

functions-python/batch_datasets/src/main.py

Lines changed: 15 additions & 4 deletions
@@ -19,6 +19,7 @@
 import os
 import uuid
 from datetime import datetime
+from typing import Optional

 import functions_framework
 from google.cloud import pubsub_v1
@@ -64,7 +65,7 @@ def publish(publisher: PublisherClient, topic_path: str, data_bytes: bytes) -> F
     return publisher.publish(topic_path, data=data_bytes)


-def get_non_deprecated_feeds(session: Session):
+def get_non_deprecated_feeds(session: Session, feed_stable_ids: Optional[list[str]] = None):
    """
    Returns a list of non deprecated feeds
    :return: list of feeds
@@ -79,14 +80,17 @@ def get_non_deprecated_feeds(session: Session):
             Gtfsfeed.authentication_info_url,
             Gtfsfeed.api_key_parameter_name,
             Gtfsfeed.status,
-            Gtfsdataset.id.label("dataset_id"),
+            Gtfsdataset.stable_id.label("dataset_stable_id"),
             Gtfsdataset.hash.label("dataset_hash"),
         )
         .select_from(Gtfsfeed)
         .outerjoin(Gtfsdataset, (Gtfsdataset.feed_id == Gtfsfeed.id))
         .filter(Gtfsfeed.status != "deprecated")
         .filter(or_(Gtfsdataset.id.is_(None), Gtfsdataset.latest.is_(True)))
     )
+    if feed_stable_ids:
+        # If feed_stable_ids are provided, filter the query by stable IDs
+        query = query.filter(Gtfsfeed.stable_id.in_(feed_stable_ids))
     # Limit the query to 10 feeds (or FEEDS_LIMIT param) for testing purposes and lower environments
     if os.getenv("ENVIRONMENT", "").lower() in ("dev", "test", "qa"):
         limit = os.getenv("FEEDS_LIMIT")
@@ -108,8 +112,15 @@ def batch_datasets(request, db_session: Session):
    :param db_session: database session object
    :return: HTTP response object
    """
+    feed_stable_ids = None
    try:
-        feeds = get_non_deprecated_feeds(db_session)
+        request_json = request.get_json()
+        feed_stable_ids = request_json.get("feed_stable_ids") if request_json else None
+    except Exception:
+        logging.info(f"No feed_stable_ids provided in the request, processing all feeds.")
+
+    try:
+        feeds = get_non_deprecated_feeds(db_session, feed_stable_ids=feed_stable_ids)
    except Exception as error:
        logging.error(f"Error retrieving feeds: {error}")
        raise Exception(f"Error retrieving feeds: {error}")
@@ -130,7 +141,7 @@ def batch_datasets(request, db_session: Session):
            "producer_url": feed.producer_url,
            "feed_stable_id": feed.stable_id,
            "feed_id": feed.feed_id,
-            "dataset_id": feed.dataset_id,
+            "dataset_stable_id": feed.dataset_stable_id,
            "dataset_hash": feed.dataset_hash,
            "authentication_type": feed.authentication_type,
            "authentication_info_url": feed.authentication_info_url,

functions-python/batch_process_dataset/src/main.py

Lines changed: 110 additions & 50 deletions
@@ -16,6 +16,7 @@

 import base64
 import json
+import logging
 import os
 import random
 import uuid
@@ -28,17 +29,14 @@
 from cloudevents.http import CloudEvent
 from google.cloud import storage
 from sqlalchemy import func
+from sqlalchemy.orm import Session
+
 from shared.common.gcp_utils import create_refresh_materialized_view_task
+from shared.database.database import with_db_session
 from shared.database_gen.sqlacodegen_models import Gtfsdataset, Gtfsfile
-
 from shared.dataset_service.main import DatasetTraceService, DatasetTrace, Status
-from shared.database.database import with_db_session
-import logging
-
 from shared.helpers.logger import init_logger, get_logger
-from shared.helpers.utils import download_and_get_hash, get_hash_from_file
-from sqlalchemy.orm import Session
-
+from shared.helpers.utils import download_and_get_hash, get_hash_from_file, download_from_gcs

 init_logger()

@@ -68,6 +66,7 @@ def __init__(
         authentication_type,
         api_key_parameter_name,
         public_hosted_datasets_url,
+        dataset_stable_id
     ):
         self.logger = get_logger(DatasetProcessor.__name__, feed_stable_id)
         self.producer_url = producer_url
@@ -92,6 +91,7 @@ def __init__(

         self.init_status = None
         self.init_status_additional_data = None
+        self.dataset_stable_id = dataset_stable_id

     @staticmethod
     def get_feed_credentials(feed_stable_id) -> str | None:
@@ -132,28 +132,30 @@ def download_content(self, temporary_file_path):
         is_zip = zipfile.is_zipfile(temporary_file_path)
         return file_hash, is_zip

-    def upload_file_to_storage(
+    def upload_files_to_storage(
         self,
         source_file_path,
         dataset_stable_id,
         extracted_files_path,
         public=True,
+        skip_dataset_upload=False
     ):
         """
-        Uploads a file to the GCP bucket
+        Uploads the dataset file and extracted files to GCP storage
         """
         bucket = storage.Client().get_bucket(self.bucket_name)
         target_paths = [
             f"{self.feed_stable_id}/latest.zip",
             f"{self.feed_stable_id}/{dataset_stable_id}/{dataset_stable_id}.zip",
         ]
         blob = None
-        for target_path in target_paths:
-            blob = bucket.blob(target_path)
-            with open(source_file_path, "rb") as file:
-                blob.upload_from_file(file)
-            if public:
-                blob.make_public()
+        if not skip_dataset_upload:
+            for target_path in target_paths:
+                blob = bucket.blob(target_path)
+                blob.upload_from_filename(source_file_path)
+                if public:
+                    blob.make_public()
+                self.logger.info(f"Uploaded {blob.public_url}")

         base_path, _ = os.path.splitext(source_file_path)
         extracted_files: List[Gtfsfile] = []
@@ -162,6 +164,7 @@ def upload_file_to_storage(
                 f"Extracted files path {extracted_files_path} does not exist."
             )
             return blob, extracted_files
+        self.logger.info('Processing extracted files from %s', extracted_files_path)
         for file_name in os.listdir(extracted_files_path):
             file_path = os.path.join(extracted_files_path, file_name)
             if os.path.isfile(file_path):
@@ -192,6 +195,7 @@ def upload_dataset(self, public=True) -> DatasetFile or None:
         if the dataset hash is different from the latest dataset stored
         :return: the file hash and the hosted url as a tuple or None if no upload is required
         """
+        temp_file_path = None
         try:
             self.logger.info("Accessing URL %s", self.producer_url)
             temp_file_path = self.generate_temp_filename()
@@ -221,11 +225,8 @@ def upload_dataset(self, public=True) -> DatasetFile or None:
             dataset_full_path = (
                 f"{self.feed_stable_id}/{dataset_stable_id}/{dataset_stable_id}.zip"
             )
-            self.logger.info(
-                f"Creating file: {dataset_full_path}"
-                f" in bucket {self.bucket_name}"
-            )
-            _, extracted_files = self.upload_file_to_storage(
+            self.logger.info(f"Creating file {dataset_full_path} in bucket {self.bucket_name}")
+            _, extracted_files = self.upload_files_to_storage(
                 temp_file_path,
                 dataset_stable_id,
                 extracted_files_path,
@@ -249,10 +250,55 @@ def upload_dataset(self, public=True) -> DatasetFile or None:
                     f"-> {file_sha256_hash}). Not uploading it."
                 )
         finally:
-            if os.path.exists(temp_file_path):
+            if temp_file_path and os.path.exists(temp_file_path):
                 os.remove(temp_file_path)
         return None

+    def process2(self, public=True) -> DatasetFile or None:
+        """
+        Uploads a dataset to a GCP bucket as <feed_stable_id>/latest.zip and
+        <feed_stable_id>/<feed_stable_id>-<upload_datetime>.zip
+        if the dataset hash is different from the latest dataset stored
+        :return: the file hash and the hosted url as a tuple or None if no upload is required
+        """
+        temp_file_path = None
+        try:
+            self.logger.info("Accessing URL %s", self.producer_url)
+            temp_file_path = self.generate_temp_filename()
+            blob_file_path = f"{self.feed_stable_id}/latest.zip"
+            download_from_gcs(os.getenv('DATASETS_BUCKET_NAME'), blob_file_path, temp_file_path)
+
+            extracted_files_path = self.unzip_files(temp_file_path)
+            dataset_full_path = (
+                f"{self.feed_stable_id}/{self.dataset_stable_id}/{self.dataset_stable_id}.zip"
+            )
+            self.logger.info(f"Creating file {dataset_full_path} in bucket {self.bucket_name}")
+            _, extracted_files = self.upload_files_to_storage(
+                temp_file_path,
+                self.dataset_stable_id,
+                extracted_files_path,
+                public=public,
+                skip_dataset_upload=True,  # Skip the upload of the dataset file
+            )
+
+            dataset_file = DatasetFile(
+                stable_id=self.dataset_stable_id,
+                file_sha256_hash=self.latest_hash,
+                hosted_url=f"{self.public_hosted_datasets_url}/{dataset_full_path}",
+                extracted_files=extracted_files,
+                zipped_size=(
+                    os.path.getsize(temp_file_path)
+                    if os.path.exists(temp_file_path)
+                    else None
+                ),
+            )
+            self.create_dataset_entities(dataset_file, skip_dataset_creation=True)
+        finally:
+            if temp_file_path and os.path.exists(temp_file_path):
+                os.remove(temp_file_path)
+        return None
+
+
     def unzip_files(self, temp_file_path):
         extracted_files_path = os.path.join(temp_file_path.split(".")[0], "extracted")
         self.logger.info(f"Unzipping files to {extracted_files_path}")
@@ -270,14 +316,14 @@ def generate_temp_filename(self):
         Generates a temporary filename
         """
         temporary_file_path = (
-            f"/tmp/{self.feed_stable_id}-{random.randint(0, 1000000)}.zip"
+            f"/in-memory/{self.feed_stable_id}-{random.randint(0, 1000000)}.zip"
         )
         return temporary_file_path

     @with_db_session
-    def create_dataset(self, dataset_file: DatasetFile, db_session: Session):
+    def create_dataset_entities(self, dataset_file: DatasetFile, db_session: Session, skip_dataset_creation=False):
         """
-        Creates a new dataset in the database
+        Creates dataset entities in the database
         """
         try:
             # Check latest version of the dataset
@@ -294,30 +340,40 @@ def create_dataset(self, dataset_file: DatasetFile, db_session: Session):
             self.logger.info(
                 f"[{self.feed_stable_id}] Creating new dataset for feed with stable id {dataset_file.stable_id}."
             )
-            new_dataset = Gtfsdataset(
-                id=str(uuid.uuid4()),
-                feed_id=self.feed_id,
-                stable_id=dataset_file.stable_id,
-                latest=True,
-                bounding_box=None,
-                note=None,
-                hash=dataset_file.file_sha256_hash,
-                downloaded_at=func.now(),
-                hosted_url=dataset_file.hosted_url,
-                gtfsfiles=(
-                    dataset_file.extracted_files if dataset_file.extracted_files else []
-                ),
-                zipped_size_bytes=dataset_file.zipped_size,
-                unzipped_size_bytes=(
-                    sum([ex.file_size_bytes for ex in dataset_file.extracted_files])
-                    if dataset_file.extracted_files
-                    else None
-                ),
-            )
-            if latest_dataset:
+            if not skip_dataset_creation:
+                dataset = Gtfsdataset(
+                    id=str(uuid.uuid4()),
+                    feed_id=self.feed_id,
+                    stable_id=dataset_file.stable_id,
+                    latest=True,
+                    bounding_box=None,
+                    note=None,
+                    hash=dataset_file.file_sha256_hash,
+                    downloaded_at=func.now(),
+                    hosted_url=dataset_file.hosted_url,
+                    gtfsfiles=(
+                        dataset_file.extracted_files if dataset_file.extracted_files else []
+                    ),
+                    zipped_size_bytes=dataset_file.zipped_size,
+                    unzipped_size_bytes=(
+                        sum([ex.file_size_bytes for ex in dataset_file.extracted_files])
+                        if dataset_file.extracted_files
+                        else None
+                    ),
+                )
+                db_session.add(dataset)
+            elif skip_dataset_creation and latest_dataset:
+                latest_dataset.gtfsfiles = dataset_file.extracted_files if dataset_file.extracted_files else []
+                latest_dataset.zipped_size_bytes = dataset_file.zipped_size
+                latest_dataset.unzipped_size_bytes = (
+                    sum([ex.file_size_bytes for ex in dataset_file.extracted_files])
+                    if dataset_file.extracted_files
+                    else None
+                )
+
+            if latest_dataset and not skip_dataset_creation:
                 latest_dataset.latest = False
                 db_session.add(latest_dataset)
-            db_session.add(new_dataset)
             db_session.commit()
             self.logger.info(f"[{self.feed_stable_id}] Dataset created successfully.")

@@ -335,7 +391,7 @@ def process(self) -> DatasetFile or None:
         if dataset_file is None:
             self.logger.info(f"[{self.feed_stable_id}] No database update required.")
             return None
-        self.create_dataset(dataset_file)
+        self.create_dataset_entities(dataset_file)
         return dataset_file


@@ -374,7 +430,7 @@ def process_dataset(cloud_event: CloudEvent):
         producer_url,
         feed_stable_id,
         feed_id,
-        dataset_id,
+        dataset_stable_id,
         dataset_hash,
         authentication_type,
         authentication_info_url,
@@ -409,7 +465,7 @@ def process_dataset(cloud_event: CloudEvent):
     trace_service = None
     dataset_file: DatasetFile = None
     error_message = None
-    # Extract data from message
+    # Extract data from message
     data = base64.b64decode(cloud_event.data["message"]["data"]).decode()
     json_payload = json.loads(data)
     stable_id = json_payload["feed_stable_id"]
@@ -445,8 +501,12 @@ def process_dataset(cloud_event: CloudEvent):
             int(json_payload["authentication_type"]),
             json_payload["api_key_parameter_name"],
             public_hosted_datasets_url,
+            json_payload.get('dataset_stable_id')
         )
-        dataset_file = processor.process()
+        if json_payload.get("process_files_only", False):
+            dataset_file = processor.process2()
+        else:
+            dataset_file = processor.process()
     except Exception as e:
         # This makes sure the logger is initialized
         logger = get_logger("process_dataset", stable_id if stable_id else "UNKNOWN")
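Putting the routing change together: a hedged sketch of a Pub/Sub message that would make `process_dataset` call `process2()` instead of `process()`. Only fields visible in this diff are included; the real payload (see the batch_datasets README) may carry additional fields, and the project/topic IDs and values below are placeholders.

```python
import json

from google.cloud import pubsub_v1

publisher = pubsub_v1.PublisherClient()
# Placeholder project and topic IDs.
topic_path = publisher.topic_path("my-gcp-project", "dataset-batch-topic")

message = {
    "producer_url": "https://example.com/gtfs.zip",
    "feed_stable_id": "feed_id_1",
    "feed_id": "some-feed-uuid",
    "dataset_stable_id": "dataset_id_1",
    "dataset_hash": None,
    "authentication_type": 0,
    "authentication_info_url": None,
    "api_key_parameter_name": None,
    "process_files_only": True,  # routes to process2(): reuse latest.zip and only (re)extract files
}
future = publisher.publish(topic_path, data=json.dumps(message).encode("utf-8"))
print(future.result())  # message ID once the publish completes
```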

functions-python/batch_process_dataset/tests/test_batch_process_dataset_main.py

Lines changed: 3 additions & 3 deletions
@@ -207,7 +207,7 @@ def test_upload_file_to_storage(self):
             test_hosted_public_url,
         )
         dataset_id = faker.Faker().uuid4()
-        result, _ = processor.upload_file_to_storage(
+        result, _ = processor.upload_files_to_storage(
            source_file_path, dataset_id, extracted_file_path
        )
        self.assertEqual(result.public_url, public_url)
@@ -358,11 +358,11 @@ def test_process_no_change(self):
        )

        processor.upload_dataset = MagicMock(return_value=None)
-        processor.create_dataset = MagicMock()
+        processor.create_dataset_entities = MagicMock()
        result = processor.process()

        self.assertIsNone(result)
-        processor.create_dataset.assert_not_called()
+        processor.create_dataset_entities.assert_not_called()

    @patch("main.DatasetTraceService")
    @patch("main.DatasetProcessor")
