Commit be75b18

feat: create pmtiles for new datasets + update location extraction (#1322)
1 parent 6f9d0e9 commit be75b18

File tree

23 files changed: +986 -471 lines changed

api/src/shared/common/gcp_utils.py

Lines changed: 4 additions & 0 deletions
@@ -79,8 +79,11 @@ def create_http_task_with_name(
     task_name: str,
     task_time: Timestamp,
     http_method: "tasks_v2.HttpMethod",
+    timeout_s: int = 1800,  # 30 minutes
 ):
     """Creates a GCP Cloud Task."""
+    from google.protobuf import duration_pb2
+
     token = tasks_v2.OidcToken(service_account_email=os.getenv("SERVICE_ACCOUNT_EMAIL"))

     parent = client.queue_path(project_id, gcp_region, queue_name)
@@ -98,6 +101,7 @@ def create_http_task_with_name(
             body=body,
             headers={"Content-Type": "application/json"},
         ),
+        dispatch_deadline=duration_pb2.Duration(seconds=timeout_s),
     )
     try:
         response = client.create_task(parent=parent, task=task)
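
The new `timeout_s` parameter is converted into a protobuf `Duration` and passed as the task's `dispatch_deadline`, so long-running handlers are not cut off at the queue's default deadline. A minimal caller-side sketch of the same idea (project, queue, and URL values below are placeholders, not taken from this repository):

# Sketch only: shows how a timeout in seconds becomes a Cloud Tasks dispatch deadline.
from google.cloud import tasks_v2
from google.protobuf import duration_pb2

client = tasks_v2.CloudTasksClient()
task = tasks_v2.Task(
    http_request=tasks_v2.HttpRequest(
        http_method=tasks_v2.HttpMethod.POST,
        url="https://example.com/handler",  # placeholder endpoint
        body=b"{}",
        headers={"Content-Type": "application/json"},
    ),
    # 45-minute deadline instead of the 30-minute default introduced above
    dispatch_deadline=duration_pb2.Duration(seconds=2700),
)
parent = client.queue_path("my-project", "us-central1", "my-queue")  # placeholders
response = client.create_task(parent=parent, task=task)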

functions-python/batch_process_dataset/src/main.py

Lines changed: 15 additions & 4 deletions
@@ -41,6 +41,7 @@
     get_hash_from_file,
     download_from_gcs,
 )
+from pipeline_tasks import create_pipeline_tasks

 init_logger()

@@ -260,7 +261,10 @@ def upload_dataset(self, public=True) -> DatasetFile or None:
             os.remove(temp_file_path)
         return None

-    def process_from_bucket_latest(self, public=True) -> DatasetFile or None:
+    @with_db_session
+    def process_from_bucket_latest(
+        self, db_session, public=True
+    ) -> DatasetFile or None:
         """
         Uploads a dataset to a GCP bucket as <feed_stable_id>/latest.zip and
         <feed_stable_id>/<feed_stable_id>-<upload_datetime>.zip
@@ -300,7 +304,10 @@ def process_from_bucket_latest(self, public=True) -> DatasetFile or None:
                     else None
                 ),
             )
-            self.create_dataset_entities(dataset_file, skip_dataset_creation=True)
+            dataset = self.create_dataset_entities(
+                dataset_file, skip_dataset_creation=True, db_session=db_session
+            )
+            create_pipeline_tasks(dataset)
         finally:
             if temp_file_path and os.path.exists(temp_file_path):
                 os.remove(temp_file_path)
@@ -352,6 +359,7 @@ def create_dataset_entities(
             self.logger.info(
                 f"[{self.feed_stable_id}] Creating new dataset for feed with stable id {dataset_file.stable_id}."
             )
+            dataset = None
             if not skip_dataset_creation:
                 dataset = Gtfsdataset(
                     id=str(uuid.uuid4()),
@@ -394,10 +402,12 @@ def create_dataset_entities(
             self.logger.info(f"[{self.feed_stable_id}] Dataset created successfully.")

             create_refresh_materialized_view_task()
+            return latest_dataset if skip_dataset_creation else dataset
         except Exception as e:
             raise Exception(f"Error creating dataset: {e}")

-    def process_from_producer_url(self) -> DatasetFile or None:
+    @with_db_session
+    def process_from_producer_url(self, db_session) -> DatasetFile or None:
         """
         Process the dataset and store new version in GCP bucket if any changes are detected
         :return: the file hash and the hosted url as a tuple or None if no upload is required
@@ -407,7 +417,8 @@ def process_from_producer_url(self) -> DatasetFile or None:
         if dataset_file is None:
             self.logger.info(f"[{self.feed_stable_id}] No database update required.")
             return None
-        self.create_dataset_entities(dataset_file)
+        dataset = self.create_dataset_entities(dataset_file, db_session=db_session)
+        create_pipeline_tasks(dataset)
         return dataset_file

Lines changed: 164 additions & 0 deletions
@@ -0,0 +1,164 @@
+import json
+import logging
+import os
+from typing import Iterable, List
+
+from google.cloud import tasks_v2
+from sqlalchemy.orm import Session
+
+from shared.database.database import with_db_session
+from shared.database_gen.sqlacodegen_models import Gtfsdataset
+from shared.helpers.utils import create_http_task
+
+
+def create_http_reverse_geolocation_processor_task(
+    stable_id: str,
+    dataset_stable_id: str,
+    stops_url: str,
+) -> None:
+    """
+    Create a task to process reverse geolocation for a dataset.
+    """
+    client = tasks_v2.CloudTasksClient()
+    body = json.dumps(
+        {
+            "stable_id": stable_id,
+            "stops_url": stops_url,
+            "dataset_id": dataset_stable_id,
+        }
+    ).encode()
+    queue_name = os.getenv("REVERSE_GEOLOCATION_QUEUE")
+    project_id = os.getenv("PROJECT_ID")
+    gcp_region = os.getenv("GCP_REGION")
+
+    create_http_task(
+        client,
+        body,
+        f"https://{gcp_region}-{project_id}.cloudfunctions.net/reverse-geolocation-processor",
+        project_id,
+        gcp_region,
+        queue_name,
+    )
+
+
+def create_http_pmtiles_builder_task(
+    stable_id: str,
+    dataset_stable_id: str,
+) -> None:
+    """
+    Create a task to generate PMTiles for a dataset.
+    """
+    client = tasks_v2.CloudTasksClient()
+    body = json.dumps(
+        {"feed_stable_id": stable_id, "dataset_stable_id": dataset_stable_id}
+    ).encode()
+    queue_name = os.getenv("PMTILES_BUILDER_QUEUE")
+    project_id = os.getenv("PROJECT_ID")
+    gcp_region = os.getenv("GCP_REGION")
+    gcp_env = os.getenv("ENVIRONMENT")
+
+    create_http_task(
+        client,
+        body,
+        f"https://{gcp_region}-{project_id}.cloudfunctions.net/pmtiles-builder-{gcp_env}",
+        project_id,
+        gcp_region,
+        queue_name,
+    )
+
+
+@with_db_session
+def get_changed_files(
+    dataset: Gtfsdataset,
+    db_session: Session,
+) -> List[str]:
+    """
+    Return the subset of `file_names` whose content hash changed compared to the
+    previous dataset for the same feed.
+    - If there is no previous dataset → any file that exists in the new dataset is considered "changed".
+    - If the file existed before and now is missing → NOT considered changed.
+    - If the file did not exist before but exists now → considered changed.
+    - If hashes differ → considered changed.
+    """
+    previous_dataset = (
+        db_session.query(Gtfsdataset)
+        .filter(
+            Gtfsdataset.feed_id == dataset.feed_id,
+            Gtfsdataset.id != dataset.id,
+        )
+        .order_by(Gtfsdataset.downloaded_at.desc())
+        .first()
+    )
+
+    new_files = list(dataset.gtfsfiles)
+
+    # No previous dataset -> everything that exists now is "changed"
+    if not previous_dataset:
+        return [f.file_name for f in new_files]
+
+    prev_map = {
+        f.file_name: getattr(f, "hash", None) for f in previous_dataset.gtfsfiles
+    }
+
+    changed_files = []
+    for f in new_files:
+        new_hash = getattr(f, "hash", None)
+        old_hash = prev_map.get(f.file_name)
+
+        if old_hash is None or old_hash != new_hash:
+            changed_files.append(f)
+            logging.info(f"Changed file {f.file_name} from {old_hash} to {new_hash}")
+
+    return [f.file_name for f in changed_files]
+
+
+@with_db_session
+def create_pipeline_tasks(dataset: Gtfsdataset, db_session: Session) -> None:
+    """
+    Create pipeline tasks for a dataset.
+    """
+    changed_files = get_changed_files(dataset, db_session=db_session)
+
+    stable_id = dataset.feed.stable_id
+    dataset_stable_id = dataset.stable_id
+    gtfs_files = dataset.gtfsfiles
+    stops_file = next(
+        (file for file in gtfs_files if file.file_name == "stops.txt"), None
+    )
+    stops_url = stops_file.hosted_url if stops_file else None
+
+    # Create reverse geolocation task
+    if stops_url and "stops.txt" in changed_files:
+        create_http_reverse_geolocation_processor_task(
+            stable_id, dataset_stable_id, stops_url
+        )
+
+    routes_file = next(
+        (file for file in gtfs_files if file.file_name == "routes.txt"), None
+    )
+    # Create PMTiles builder task
+    required_files = {"stops.txt", "routes.txt", "trips.txt", "stop_times.txt"}
+    if not required_files.issubset(set(f.file_name for f in gtfs_files)):
+        logging.info(
+            f"Skipping PMTiles task for dataset {dataset_stable_id} due to missing required files. Required files: "
+            f"{required_files}, available files: {[f.file_name for f in gtfs_files]}"
+        )
+    expected_file_change: Iterable[str] = {
+        "stops.txt",
+        "trips.txt",
+        "routes.txt",
+        "stop_times.txt",
+        "shapes.txt",
+    }
+    if (
+        routes_file
+        and 0 < routes_file.file_size_bytes < 1_000_000
+        and not set(changed_files).isdisjoint(expected_file_change)
+    ):
+        create_http_pmtiles_builder_task(stable_id, dataset_stable_id)
+    elif routes_file:
+        logging.info(
+            f"Skipping PMTiles task for dataset {dataset_stable_id} due to constraints --> "
+            f"routes.txt file size : {routes_file.file_size_bytes} bytes"
+            f" and changed files: {changed_files}"
+        )
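
The hash comparison in `get_changed_files` means a file that disappeared never triggers downstream tasks, while a new or modified file does. A small standalone illustration of that rule, with plain dicts standing in for the Gtfsfile rows (the names and hashes below are made up):

# Standalone illustration of the changed-file rule; not project code.
previous = {"stops.txt": "aaa", "routes.txt": "bbb", "fare_rules.txt": "ccc"}
current = {"stops.txt": "aaa", "routes.txt": "ddd", "shapes.txt": "eee"}

# fare_rules.txt disappeared -> not "changed"; routes.txt hash differs and
# shapes.txt is new -> both "changed"; stops.txt is identical -> unchanged.
changed = [
    name
    for name, new_hash in current.items()
    if previous.get(name) is None or previous.get(name) != new_hash
]
print(changed)  # ['routes.txt', 'shapes.txt']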

functions-python/batch_process_dataset/tests/test_batch_process_dataset_main.py

Lines changed: 2 additions & 0 deletions
@@ -451,6 +451,7 @@ def test_process_dataset_missing_stable_id(self, mock_dataset_trace):
         )

     @patch.dict(os.environ, {"DATASETS_BUCKET_NAME": "test-bucket"})
+    @patch("main.create_pipeline_tasks")
     @patch("main.DatasetProcessor.create_dataset_entities")
     @patch("main.DatasetProcessor.upload_files_to_storage")
     @patch("main.DatasetProcessor.unzip_files")
@@ -461,6 +462,7 @@ def test_process_from_bucket_latest_happy_path(
         mock_unzip_files,
         mock_upload_files_to_storage,
         mock_create_dataset_entities,
+        _,
     ):
         # Arrange
         mock_blob = MagicMock()
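
The new patch is added above the existing ones, so its mock is passed after theirs and is received as the unused `_` parameter. A standalone illustration of that `unittest.mock` ordering rule (not the project's test case):

# Decorators are applied bottom-up, so the bottom-most @patch is passed first
# and the top-most patch arrives last, where it can be ignored with `_`.
import os
from unittest import TestCase, mock


class PatchOrderExample(TestCase):
    @mock.patch("os.getcwd")   # top-most patch -> last mock argument
    @mock.patch("os.listdir")  # bottom-most patch -> first mock argument
    def test_order(self, mock_listdir, _):
        mock_listdir.return_value = []
        self.assertEqual(os.listdir("."), [])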
