Skip to content

Commit ee2303d

Browse files
authored
fix: increase batch to 500 and delete visualization join in the CSV export function (#1379)
1 parent f0a52ad commit ee2303d

File tree

5 files changed: +25 additions, −24 deletions

api/src/shared/common/db_utils.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ def get_all_gtfs_feeds(
164164
165165
:return: The GTFS feeds in an iterator.
166166
"""
167-
batch_size = os.getenv("BATCH_SIZE", 100)
167+
batch_size = int(os.getenv("BATCH_SIZE", "500"))
168168
batch_query = db_session.query(Gtfsfeed).order_by(Gtfsfeed.stable_id).yield_per(batch_size)
169169
if published_only:
170170
batch_query = batch_query.filter(Gtfsfeed.operational_status == "published")
@@ -182,7 +182,6 @@ def get_all_gtfs_feeds(
182182
contains_eager(Gtfsfeed.gtfsdatasets)
183183
.joinedload(Gtfsdataset.validation_reports)
184184
.joinedload(Validationreport.features),
185-
joinedload(Gtfsfeed.visualization_dataset),
186185
*get_joinedload_options(include_extracted_location_entities=True),
187186
)
188187
)

functions-python/export_csv/src/main.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,6 @@ def get_gtfs_feed_csv_data(
208208
:return: Dictionary with feed data formatted for CSV output.
209209
"""
210210
joined_features = ""
211-
validated_at = None
212211
bounding_box = None
213212

214213
# First extract the common feed data
@@ -240,18 +239,16 @@ def get_gtfs_feed_csv_data(
240239
if features
241240
else ""
242241
)
243-
if latest_report.validated_at:
244-
validated_at = latest_report.validated_at
245-
if latest_dataset.bounding_box:
246-
shape = to_shape(latest_dataset.bounding_box)
247-
if shape and shape.bounds:
248-
bounding_box = BoundingBox(
249-
minimum_latitude=shape.bounds[1],
250-
maximum_latitude=shape.bounds[3],
251-
minimum_longitude=shape.bounds[0],
252-
maximum_longitude=shape.bounds[2],
253-
extracted_on=validated_at,
254-
)
242+
if feed.bounding_box:
243+
shape = to_shape(feed.bounding_box)
244+
if shape and shape.bounds:
245+
bounding_box = BoundingBox(
246+
minimum_latitude=shape.bounds[1],
247+
maximum_latitude=shape.bounds[3],
248+
minimum_longitude=shape.bounds[0],
249+
maximum_longitude=shape.bounds[2],
250+
extracted_on=feed.bounding_box_dataset.downloaded_at,
251+
)
255252

256253
# Keep the bounding box for that GTFS feed so it can be used in associated real-time feeds, if any
257254
if bounding_box:

functions-python/export_csv/tests/__init__.py

Whitespace-only changes.

functions-python/export_csv/tests/conftest.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ def populate_database(db_session):
156156
hosted_url=f"https://url_prefix/{feed_stable_id}/dataset-{i}_some_fake_hosted_url",
157157
note=f"dataset-{i} Some fake note",
158158
hash=fake.sha256(),
159-
downloaded_at=datetime.utcnow(),
159+
downloaded_at=datetime(2025, 1, 12),
160160
stable_id=f"dataset-{i}",
161161
)
162162
validation_report = Validationreport(
@@ -175,6 +175,9 @@ def populate_database(db_session):
175175
gtfs_dataset.locations = locations
176176

177177
active_gtfs_feeds[feed_index].gtfsdatasets.append(gtfs_dataset)
178+
db_session.flush()
179+
active_gtfs_feeds[feed_index].bounding_box = gtfs_dataset.bounding_box
180+
active_gtfs_feeds[feed_index].bounding_box_dataset_id = gtfs_dataset.id
178181
active_gtfs_feeds[0].locations = locations
179182
active_gtfs_feeds[1].locations = locations
180183

functions-python/export_csv/tests/test_export_csv_main.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
# limitations under the License.
1515
#
1616
import io
17+
import unittest
1718

1819
import pandas as pd
1920
import pandas.testing as pdt
@@ -34,13 +35,14 @@
3435
""" # noqa
3536

3637

37-
def test_export_csv():
38-
csv_file_path = "./output.csv"
39-
main.export_csv(csv_file_path)
40-
df_actual = pd.read_csv(csv_file_path)
41-
print(f"Collected data for {len(df_actual)} feeds.")
38+
class TestExportCSV(unittest.TestCase):
39+
def test_export_csv(self):
40+
csv_file_path = "./output.csv"
41+
main.export_csv(csv_file_path)
42+
df_actual = pd.read_csv(csv_file_path)
43+
print(f"Collected data for {len(df_actual)} feeds.")
4244

43-
df_expected = pd.read_csv(io.StringIO(expected_csv))
45+
df_expected = pd.read_csv(io.StringIO(expected_csv))
4446

45-
pdt.assert_frame_equal(df_actual, df_expected)
46-
print("DataFrames are equal.")
47+
pdt.assert_frame_equal(df_actual, df_expected)
48+
print("DataFrames are equal.")

Comments (0)