Commit c480b2e

feat: added batch fill for dataset size and files (#1302)
1 parent 159bc76 commit c480b2e

12 files changed, +541 −23 lines changed


functions-python/batch_process_dataset/src/main.py

Lines changed: 4 additions & 2 deletions

```diff
@@ -28,7 +28,6 @@
 from cloudevents.http import CloudEvent
 from google.cloud import storage
 from sqlalchemy import func
-
 from shared.common.gcp_utils import create_refresh_materialized_view_task
 from shared.database_gen.sqlacodegen_models import Gtfsdataset, Gtfsfile

@@ -37,9 +36,10 @@
 import logging

 from shared.helpers.logger import init_logger, get_logger
-from shared.helpers.utils import download_and_get_hash
+from shared.helpers.utils import download_and_get_hash, get_hash_from_file
 from sqlalchemy.orm import Session

+
 init_logger()


@@ -179,6 +179,8 @@ def upload_file_to_storage(
             id=str(uuid.uuid4()),
             file_name=file_name,
             file_size_bytes=os.path.getsize(file_path),
+            hosted_url=file_blob.public_url if public else None,
+            hash=get_hash_from_file(file_path),
         )
     )
     return blob, extracted_files
```

functions-python/helpers/utils.py

Lines changed: 17 additions & 0 deletions

```diff
@@ -16,6 +16,7 @@
 import hashlib
 import logging
 import os
+import ssl

 import requests
 import urllib3
@@ -78,6 +79,17 @@ def download_url_content(url, with_retry=False):
         raise e


+def get_hash_from_file(file_path, hash_algorithm="sha256", chunk_size=8192):
+    """
+    Returns the hash of a file
+    """
+    hash_object = hashlib.new(hash_algorithm)
+    with open(file_path, "rb") as f:
+        for chunk in iter(lambda: f.read(chunk_size), b""):
+            hash_object.update(chunk)
+    return hash_object.hexdigest()
+
+
 def download_and_get_hash(
     url,
     file_path,
@@ -87,6 +99,7 @@ def download_and_get_hash(
     api_key_parameter_name=None,
     credentials=None,
     logger=None,
+    trusted_certs=False,  # If True, disables SSL verification
 ):
     """
     Downloads the content of a URL and stores it in a file and returns the hash of the file
@@ -117,6 +130,10 @@
     if authentication_type == 2 and api_key_parameter_name and credentials:
         headers[api_key_parameter_name] = credentials

+    if trusted_certs:
+        ctx.check_hostname = False
+        ctx.verify_mode = ssl.CERT_NONE
+
     with urllib3.PoolManager(ssl_context=ctx) as http:
         with http.request(
             "GET", url, preload_content=False, headers=headers, redirect=True
```

functions-python/tasks_executor/README.md

Lines changed: 3 additions & 4 deletions

````diff
@@ -24,6 +24,8 @@ Example:
     "filter_statuses": ["active", "inactive", "future"]
   }
 }
+```
+```json
 {
   "task": "rebuild_missing_bounding_boxes",
   "payload": {
@@ -40,12 +42,9 @@ Example:
 ```

 To get the list of supported tasks use:
-``
+```json
 {
   "name": "list_tasks",
   "payload": {}
 }
-
-```
-
 ```
````

functions-python/tasks_executor/function_config.json

Lines changed: 4 additions & 0 deletions

```diff
@@ -11,6 +11,10 @@
   "secret_environment_variables": [
     {
       "key": "FEEDS_DATABASE_URL"
+    },
+    {
+      "key": "FEEDS_CREDENTIALS",
+      "secret": "FEEDS_CREDENTIALS"
     }
   ],
   "ingress_settings": "ALLOW_ALL",
```

functions-python/tasks_executor/requirements.txt

Lines changed: 1 addition & 0 deletions

```diff
@@ -22,6 +22,7 @@ google-cloud-workflows
 google-cloud-pubsub
 google-cloud-tasks
 flask
+google-cloud-storage

 # Configuration
 python-dotenv==1.0.0
```

functions-python/tasks_executor/src/main.py

Lines changed: 7 additions & 0 deletions

```diff
@@ -22,6 +22,9 @@
 from tasks.refresh_feedsearch_view.refresh_materialized_view import (
     refresh_materialized_view_handler,
 )
+from tasks.dataset_files.rebuild_missing_dataset_files import (
+    rebuild_missing_dataset_files_handler,
+)
 from tasks.validation_reports.rebuild_missing_validation_reports import (
     rebuild_missing_validation_reports_handler,
 )
@@ -56,6 +59,10 @@
         "description": "Refreshes the materialized view.",
         "handler": refresh_materialized_view_handler,
     },
+    "rebuild_missing_dataset_files": {
+        "description": "Rebuilds missing dataset files for GTFS datasets.",
+        "handler": rebuild_missing_dataset_files_handler,
+    },
 }

```
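
With the handler registered above, the new task can be triggered through the tasks executor using the request envelope shown in its README earlier in this commit; the payload fields mirror the task README below. An illustrative request body:

```json
{
  "task": "rebuild_missing_dataset_files",
  "payload": {
    "dry_run": true,
    "after_date": "2025-07-01",
    "latest_only": true
  }
}
```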

Lines changed: 72 additions & 0 deletions
New file:

````markdown
# Rebuild Missing Dataset Files

This task rebuilds missing extracted files in GTFS datasets.
It downloads datasets from their `hosted_url`, extracts all files, computes zipped and unzipped sizes, calculates hashes, uploads the files to a GCS bucket, and updates the database.

---

## Task ID

Use task ID: `rebuild_missing_dataset_files`

---

## Usage

The function accepts the following payload:

```json
{
  "dry_run": true,            // [optional] If true, do not upload or modify the database (default: true)
  "after_date": "YYYY-MM-DD", // [optional] Only include datasets downloaded after this ISO date
  "latest_only": true         // [optional] If true, only process the latest version of each dataset (default: true)
}
```

### Example:

```json
{
  "dry_run": false,
  "after_date": "2025-07-01",
  "latest_only": true
}
```

---

## What It Does

For each GTFS dataset with missing file information (missing zipped/unzipped sizes or missing extracted files), this function:

1. Downloads the `.zip` file from its `hosted_url`
2. Computes the zipped size in bytes
3. Extracts all GTFS files locally
4. Computes the unzipped size in bytes
5. Uploads each extracted file to a GCS bucket with the structure:

   ```
   {feed-stable-id}/{dataset-stable-id}/extracted/{file_name}
   ```

6. Makes each file publicly accessible and stores its GCS URL
7. Computes SHA256 hashes for each file
8. Stores metadata in the `Gtfsfile` table for later use

---

## GCP Environment Variables

The function requires the following environment variables:

| Variable               | Description                                                    |
| ---------------------- | -------------------------------------------------------------- |
| `DATASETS_BUCKET_NAME` | The name of the GCS bucket used to store extracted GTFS files  |

---

## Additional Notes

* This function **disables SSL verification** when downloading files, as the sources are trusted internally.
* Commits to the database occur in batches of 5 datasets to improve performance and avoid large transaction blocks.
* If `dry_run` is enabled, no downloads, uploads, or DB modifications are performed. Only the number of affected datasets is logged.
* The function is safe to rerun. It will only affect datasets missing required file metadata.
````
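
To make the numbered steps in the README concrete, here is a minimal, self-contained sketch of the per-dataset flow. It is not the actual task code: the function name and signature are illustrative, and the `Gtfsfile` database write is replaced by a returned dict.

```python
import hashlib
import os
import tempfile
import urllib.request
import zipfile

from google.cloud import storage


def rebuild_dataset_files(
    feed_stable_id: str,
    dataset_stable_id: str,
    hosted_url: str,
    bucket_name: str,
    dry_run: bool = True,
) -> dict:
    """Illustrative sketch of rebuilding a single dataset's extracted files."""
    if dry_run:
        # Dry run: report only, no download, upload, or DB changes.
        return {"dataset": dataset_stable_id, "dry_run": True}

    client = storage.Client()
    bucket = client.bucket(bucket_name)
    results = {"dataset": dataset_stable_id, "files": []}

    with tempfile.TemporaryDirectory() as tmp_dir:
        # 1-2. Download the .zip from its hosted_url and record the zipped size.
        zip_path = os.path.join(tmp_dir, "dataset.zip")
        urllib.request.urlretrieve(hosted_url, zip_path)
        results["zipped_size_bytes"] = os.path.getsize(zip_path)

        # 3-4. Extract all GTFS files locally and compute the unzipped size.
        extract_dir = os.path.join(tmp_dir, "extracted")
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(extract_dir)
        results["unzipped_size_bytes"] = sum(
            os.path.getsize(os.path.join(root, name))
            for root, _, names in os.walk(extract_dir)
            for name in names
        )

        # 5-8. Upload each extracted file under
        # {feed-stable-id}/{dataset-stable-id}/extracted/{file_name},
        # make it public, and keep its URL, size, and SHA-256 hash
        # (the real task stores these as Gtfsfile rows).
        for file_name in sorted(os.listdir(extract_dir)):
            local_path = os.path.join(extract_dir, file_name)
            blob = bucket.blob(
                f"{feed_stable_id}/{dataset_stable_id}/extracted/{file_name}"
            )
            blob.upload_from_filename(local_path)
            blob.make_public()
            with open(local_path, "rb") as f:
                file_hash = hashlib.sha256(f.read()).hexdigest()
            results["files"].append(
                {
                    "file_name": file_name,
                    "file_size_bytes": os.path.getsize(local_path),
                    "hosted_url": blob.public_url,
                    "hash": file_hash,
                }
            )
    return results
```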
