
Commit eaeaf1c

Merge branch 'main' into 1172-refresh-feedsearch-view-asynchronically
2 parents: 18d92ab + bf22847

File tree

10 files changed: +221 -33 lines


.gitignore

Lines changed: 4 additions & 1 deletion
@@ -77,4 +77,7 @@ coverage_reports
 tf.plan
 
 # CSV generation output files
-functions-python/**/*.csv
+functions-python/**/*.csv
+
+# Local emulators
+.cloudstorage

functions-python/batch_process_dataset/.coveragerc

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ omit =
     */database_gen/*
     */dataset_service/*
     */shared/*
+    */scripts/*
 
 [report]
 exclude_lines =

functions-python/batch_process_dataset/requirements_dev.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@ Faker
22
pytest~=7.4.3
33
urllib3-mock
44
requests-mock
5-
python-dotenv~=1.0.0
5+
python-dotenv~=1.0.0
6+
gcp-storage-emulator

functions-python/batch_process_dataset/src/main.py

Lines changed: 30 additions & 17 deletions
@@ -130,18 +130,15 @@ def download_content(self, temporary_file_path):
             logger=self.logger,
         )
         is_zip = zipfile.is_zipfile(temporary_file_path)
-        if is_zip:
-            extracted_file_path = os.path.join(
-                temporary_file_path.split(".")[0], "extracted"
-            )
-            with zipfile.ZipFile(temporary_file_path, "r") as zip_ref:
-                zip_ref.extractall(os.path.dirname(extracted_file_path))
-            # List all files in the extracted directory
-            extracted_files = os.listdir(os.path.dirname(extracted_file_path))
-            self.logger.info(f"Extracted files: {extracted_files}")
         return file_hash, is_zip
 
-    def upload_file_to_storage(self, source_file_path, dataset_stable_id):
+    def upload_file_to_storage(
+        self,
+        source_file_path,
+        dataset_stable_id,
+        extracted_files_path,
+        public=True,
+    ):
         """
         Uploads a file to the GCP bucket
         """
@@ -155,12 +152,12 @@ def upload_file_to_storage(self, source_file_path, dataset_stable_id):
         blob = bucket.blob(target_path)
         with open(source_file_path, "rb") as file:
             blob.upload_from_file(file)
-        blob.make_public()
+        if public:
+            blob.make_public()
 
         base_path, _ = os.path.splitext(source_file_path)
-        extracted_files_path = os.path.join(base_path, "extracted")
         extracted_files: List[Gtfsfile] = []
-        if not os.path.exists(extracted_files_path):
+        if not extracted_files_path or not os.path.exists(extracted_files_path):
             self.logger.warning(
                 f"Extracted files path {extracted_files_path} does not exist."
             )
@@ -172,7 +169,8 @@ def upload_file_to_storage(self, source_file_path, dataset_stable_id):
                 f"{self.feed_stable_id}/{dataset_stable_id}/extracted/{file_name}"
             )
             file_blob.upload_from_filename(file_path)
-            file_blob.make_public()
+            if public:
+                file_blob.make_public()
             self.logger.info(
                 f"Uploaded extracted file {file_name} to {file_blob.public_url}"
             )
@@ -185,7 +183,7 @@ def upload_file_to_storage(self, source_file_path, dataset_stable_id):
         )
         return blob, extracted_files
 
-    def upload_dataset(self) -> DatasetFile or None:
+    def upload_dataset(self, public=True) -> DatasetFile or None:
         """
         Uploads a dataset to a GCP bucket as <feed_stable_id>/latest.zip and
         <feed_stable_id>/<feed_stable_id>-<upload_datetime>.zip
@@ -205,12 +203,12 @@ def upload_dataset(self) -> DatasetFile or None:
         self.logger.info(
             f"[{self.feed_stable_id}] File hash is {file_sha256_hash}."
         )
-
        if self.latest_hash != file_sha256_hash:
            self.logger.info(
                f"[{self.feed_stable_id}] Dataset has changed (hash {self.latest_hash}"
                f"-> {file_sha256_hash}). Uploading new version."
            )
+            extracted_files_path = self.unzip_files(temp_file_path)
            self.logger.info(
                f"Creating file {self.feed_stable_id}/latest.zip in bucket {self.bucket_name}"
            )
@@ -226,7 +224,10 @@ def upload_dataset(self) -> DatasetFile or None:
                f" in bucket {self.bucket_name}"
            )
            _, extracted_files = self.upload_file_to_storage(
-                temp_file_path, dataset_stable_id
+                temp_file_path,
+                dataset_stable_id,
+                extracted_files_path,
+                public=public,
            )
 
            return DatasetFile(
@@ -250,6 +251,18 @@ def upload_dataset(self) -> DatasetFile or None:
            os.remove(temp_file_path)
        return None
 
+    def unzip_files(self, temp_file_path):
+        extracted_files_path = os.path.join(temp_file_path.split(".")[0], "extracted")
+        self.logger.info(f"Unzipping files to {extracted_files_path}")
+        # Create the directory for extracted files if it does not exist
+        os.makedirs(extracted_files_path, exist_ok=True)
+        with zipfile.ZipFile(temp_file_path, "r") as zip_ref:
+            zip_ref.extractall(path=extracted_files_path)
+        # List all files in the extracted directory
+        extracted_files = os.listdir(extracted_files_path)
+        self.logger.info(f"Extracted files: {extracted_files}")
+        return extracted_files_path
+
    def generate_temp_filename(self):
        """
        Generates a temporary filename
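This change moves zip extraction out of download_content into the new unzip_files step, whose output path then flows into upload_file_to_storage along with the new public flag. A minimal standalone sketch of that extraction step, assuming only the standard library (the real method lives on DatasetProcessor and logs through self.logger):

```python
import os
import zipfile


def unzip_to_extracted_dir(zip_path: str) -> str:
    """Extract a dataset zip into a sibling '<name>/extracted' directory and return that path.

    Hypothetical helper mirroring the unzip_files step added in this commit.
    """
    extracted_files_path = os.path.join(zip_path.split(".")[0], "extracted")
    # Create the directory for extracted files if it does not exist
    os.makedirs(extracted_files_path, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(path=extracted_files_path)
    # List the extracted files, mirroring the logging in unzip_files
    print(f"Extracted files: {os.listdir(extracted_files_path)}")
    return extracted_files_path
```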
New file: 84 additions & 0 deletions

@@ -0,0 +1,84 @@
+import logging
+import os
+
+from main import DatasetProcessor
+from gcp_storage_emulator.server import create_server
+
+HOST = "localhost"
+PORT = 9023
+BUCKET_NAME = "verifier"
+PRODUCER_URL = "https://example.com/dataset.zip"  # Replace with actual producer URL
+
+
+def verify_download_content(producer_url: str):
+    """
+    Verifies the download_content is able to retrieve the file
+    This is useful to simulate the download code locally and test issues related with user-agent and downloaded content.
+    Not supported authenticated feeds currently.
+    """
+    logging.info("Verifying downloaded content... (not implemented)")
+
+    logging.info(f"Producer URL: {producer_url}")
+
+    processor = DatasetProcessor(
+        producer_url=producer_url,
+        feed_id=None,
+        feed_stable_id=None,
+        execution_id=None,
+        latest_hash=None,
+        bucket_name=None,
+        authentication_type=0,
+        api_key_parameter_name=None,
+        public_hosted_datasets_url=None,
+    )
+    tempfile = processor.generate_temp_filename()
+    logging.info(f"Temp filename: {tempfile}")
+    file_hash, is_zip = processor.download_content(tempfile)
+    logging.info(f"File hash: {file_hash}")
+
+
+def verify_upload_dataset(producer_url: str):
+    """
+    Verifies the upload_dataset is able to upload the dataset to the GCP storage emulator.
+    This is useful to simulate the upload code locally and test issues related with user-agent and uploaded content.
+    This function also tests the DatasetProcessor class methods for generating a temporary filename
+    and uploading the dataset.
+    :param producer_url:
+    :return:
+    """
+    processor = DatasetProcessor(
+        producer_url=producer_url,
+        feed_id="feed_id",
+        feed_stable_id="feed_stable_id",
+        execution_id=None,
+        latest_hash="123",
+        bucket_name=BUCKET_NAME,
+        authentication_type=0,
+        api_key_parameter_name=None,
+        public_hosted_datasets_url=None,
+    )
+    tempfile = processor.generate_temp_filename()
+    logging.info(f"Temp filename: {tempfile}")
+    dataset_file = processor.upload_dataset(public=False)
+    logging.info(f"Dataset File: {dataset_file}")
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    # Replace with actual producer URL
+    try:
+        os.environ["STORAGE_EMULATOR_HOST"] = f"http://{HOST}:{PORT}"
+        server = create_server(
+            host=HOST, port=PORT, in_memory=False, default_bucket=BUCKET_NAME
+        )
+        server.start()
+
+        verify_download_content(producer_url=PRODUCER_URL)
+        logging.info("Download content verification completed successfully.")
+        verify_upload_dataset(producer_url=PRODUCER_URL)
+        verify_upload_dataset(producer_url=PRODUCER_URL)
+    except Exception as e:
+        logging.error(f"Error verifying download content: {e}")
+    finally:
+        server.stop()
+        logging.info("Verification completed.")

functions-python/batch_process_dataset/tests/test_batch_process_dataset_main.py

Lines changed: 7 additions & 2 deletions
@@ -46,8 +46,9 @@ def create_cloud_event(mock_data):
 class TestDatasetProcessor(unittest.TestCase):
     @patch("main.DatasetProcessor.upload_file_to_storage")
     @patch("main.DatasetProcessor.download_content")
+    @patch("main.DatasetProcessor.unzip_files")
     def test_upload_dataset_diff_hash(
-        self, mock_download_url_content, upload_file_to_storage
+        self, mock_unzip_files, mock_download_url_content, upload_file_to_storage
     ):
         """
         Test upload_dataset method of DatasetProcessor class with different hash from the latest one
@@ -57,6 +58,7 @@ def test_upload_dataset_diff_hash(
         mock_blob.path = public_url
         upload_file_to_storage.return_value = mock_blob, []
         mock_download_url_content.return_value = file_hash, True
+        mock_unzip_files.return_value = [mock_blob, mock_blob]
 
         processor = DatasetProcessor(
             public_url,
@@ -178,6 +180,7 @@ def test_upload_dataset_download_exception(
     def test_upload_file_to_storage(self):
         bucket_name = "test-bucket"
         source_file_path = "path/to/source/file"
+        extracted_file_path = "path/to/source/file"
 
         mock_blob = Mock()
         mock_blob.public_url = public_url
@@ -204,7 +207,9 @@ def test_upload_file_to_storage(self):
             test_hosted_public_url,
         )
         dataset_id = faker.Faker().uuid4()
-        result, _ = processor.upload_file_to_storage(source_file_path, dataset_id)
+        result, _ = processor.upload_file_to_storage(
+            source_file_path, dataset_id, extracted_file_path
+        )
         self.assertEqual(result.public_url, public_url)
         mock_client.get_bucket.assert_called_with(bucket_name)
         mock_bucket.blob.assert_called_with(

functions-python/gbfs_validator/README.md

Lines changed: 8 additions & 1 deletion
@@ -30,7 +30,7 @@ The message published by the batch function to the Pub/Sub topic follows this format:
 
 ### Functionality Details
 
-- **`gbfs-validator-batch`**: Triggered per execution ID, this function iterates over all GBFS feeds, preparing and publishing individual messages to the Pub/Sub topic.
+- **`gbfs-validator-batch`**: Triggered per execution ID. When the request is a POST with a JSON body containing `feed_stable_ids`, it publishes events only for those feeds; otherwise, it publishes events for all feeds to the Pub/Sub topic.
 - **`gbfs-validator-pubsub`**: Triggered per feed, this function performs the following steps:
   1. **Access the autodiscovery URL and update versions**: The function accesses the autodiscovery URL to update the **GBFSVersions** table.
   2. **Measure latency and validate the feed**: For each version, the function measures the response latency and validates the feed. The validation summary is stored in GCP, and the total error count is extracted and saved in the **GBFSValidationReport**.
@@ -46,6 +46,13 @@ The `gbfs-validator-batch` function requires the following environment variables:
 - **`PROJECT_ID`**: The Google Cloud Project ID used to construct the full topic path.
 - **`FEEDS_DATABASE_URL`**: The database connection string for accessing the GBFS feeds.
 
+Optional request body parameters for the batch function:
+```json
+{
+  "feed_stable_ids": ["feed_id_1", "feed_id_2"]
+}
+```
+
 ### Pub/Sub Function Environment Variables
 
 The `gbfs-validator-pubsub` function requires the following environment variables:
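For illustration, a hypothetical client call matching the optional batch request body documented above (not part of this commit); the endpoint URL is an assumption and depends on where `gbfs-validator-batch` is deployed:

```python
import requests

# Hypothetical URL of the deployed gbfs-validator-batch HTTP function.
BATCH_FUNCTION_URL = "https://example.com/gbfs-validator-batch"

# POST a JSON body with feed_stable_ids to validate only those feeds;
# omitting the body triggers validation of all GBFS feeds.
response = requests.post(
    BATCH_FUNCTION_URL,
    json={"feed_stable_ids": ["feed_id_1", "feed_id_2"]},
    timeout=60,
)
print(response.status_code, response.text)
```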

functions-python/gbfs_validator/src/main.py

Lines changed: 34 additions & 4 deletions
@@ -101,9 +101,11 @@ def gbfs_validator_pubsub(cloud_event: CloudEvent):
 
 @with_db_session
 @functions_framework.http
-def gbfs_validator_batch(_, db_session: Session):
+def gbfs_validator_batch(request, db_session: Session):
     """
-    HTTP Cloud Function to trigger the GBFS Validator function for multiple datasets.
+    HTTP Cloud Function to trigger the GBFS Validator function for multiple datasets.
+    When the request is a POST request with a JSON body containing `feed_stable_ids`,
+    it processes only those feeds. Otherwise, it processes all feeds in the database.
     @param _: The request object.
     @return: The response of the function.
     """
@@ -113,9 +115,25 @@ def gbfs_validator_batch(_, db_session: Session):
         logging.error("PUBSUB_TOPIC_NAME environment variable not set.")
         return "PUBSUB_TOPIC_NAME environment variable not set.", 500
 
-    # Get all GBFS feeds from the database
     try:
-        gbfs_feeds = fetch_all_gbfs_feeds(db_session)
+        feed_stable_ids = None
+        if request and request.method == "POST" and request.is_json:
+            request_json = request.get_json()
+            feed_stable_ids = (
+                request_json.get("feed_stable_ids") if request_json else None
+            )
+        else:
+            logging.info("Request body not provided or not a valid JSON.")
+    except Exception as e:
+        logging.error("Error parsing request body: %s", e)
+        return "Invalid request body.", 400
+
+    try:
+        if feed_stable_ids:
+            gbfs_feeds = fetch_gbfs_feeds_by_stable_ids(db_session, feed_stable_ids)
+        else:
+            # Get all GBFS feeds from the database
+            gbfs_feeds = fetch_all_gbfs_feeds(db_session)
     except Exception:
         return "Error getting all GBFS feeds.", 500
 
@@ -150,3 +168,15 @@ def gbfs_validator_batch(_, db_session: Session):
         f"GBFS Validator batch function triggered successfully for {len(feeds_data)} feeds.",
         200,
     )
+
+
+def fetch_gbfs_feeds_by_stable_ids(db_session, feed_stable_ids):
+    """Fetch GBFS feeds by their IDs and not deprecated from the database"""
+    gbfs_feeds = (
+        db_session.query(Gbfsfeed)
+        .filter(
+            Gbfsfeed.stable_id.in_(feed_stable_ids), Gbfsfeed.status != "deprecated"
+        )
+        .all()
+    )
+    return gbfs_feeds
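A small hypothetical sketch of the request-parsing branch added above, using a stand-in object that exposes the same attributes the function reads (the real function receives a Flask request via functions_framework):

```python
from types import SimpleNamespace

# Hypothetical stand-in mimicking the attributes gbfs_validator_batch inspects.
fake_request = SimpleNamespace(
    method="POST",
    is_json=True,
    get_json=lambda: {"feed_stable_ids": ["gbfs-feed-1", "gbfs-feed-2"]},
)

feed_stable_ids = None
if fake_request and fake_request.method == "POST" and fake_request.is_json:
    body = fake_request.get_json()
    feed_stable_ids = body.get("feed_stable_ids") if body else None

# Only these feeds would be fetched and published; None means all feeds.
print(feed_stable_ids)
```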

functions-python/gbfs_validator/tests/test_gbfs_validator.py

Lines changed: 38 additions & 1 deletion
@@ -75,7 +75,6 @@ def test_gbfs_validator_batch(
     ):
         # Prepare mocks
         mock_session = MagicMock()
-        # mock_database.return_value.start_db_session.return_value = mock_session
 
         mock_publisher = MagicMock()
         mock_publisher_client.return_value = mock_publisher
@@ -179,3 +178,41 @@ def test_gbfs_validator_batch_publish_exception(
         # Call the function
         result = gbfs_validator_batch(None)
         self.assertEqual(result[1], 500)
+
+    @patch.dict(
+        os.environ,
+        {
+            "PUBSUB_TOPIC_NAME": "mock-topic",
+        },
+    )
+    @patch("main.pubsub_v1.PublisherClient")
+    @patch("main.fetch_gbfs_feeds_by_stable_ids")
+    def test_gbfs_validator_batch_by_feed_stable_ids(
+        self, fetch_gbfs_feeds_by_stable_ids, mock_publisher_client
+    ):
+        # Prepare mocks
+        mock_session = MagicMock()
+
+        mock_publisher = MagicMock()
+        mock_publisher_client.return_value = mock_publisher
+
+        mock_feed = MagicMock()
+        mock_feed.stable_id = "mock-stable-id"
+        mock_feed.id = str(uuid.uuid4())
+        mock_feed.auto_discovery_url = "http://mock-url.com"
+        mock_feed.gbfsversions = [MagicMock(version="1.0")]
+        mock_feed_2 = copy.deepcopy(mock_feed)
+        mock_feed_2.gbfsversions = []
+        fetch_gbfs_feeds_by_stable_ids.return_value = [mock_feed, mock_feed_2]
+        request = MagicMock()
+        request.method = "POST"
+        request.is_json = True
+        request.get_json.return_value = {
+            "feed_stable_ids": [mock_feed.id, mock_feed_2.id]
+        }
+        # Call the function
+        result = gbfs_validator_batch(request, db_session=mock_session)
+        self.assertEqual(result[1], 200)
+
+        fetch_gbfs_feeds_by_stable_ids.assert_called_once()
+        self.assertEqual(mock_publisher.publish.call_count, 2)
