
Commit d705342

Added testing to export_csv

1 parent 16b8c4c, commit d705342

File tree

9 files changed: +248 additions, -99 deletions

api/src/shared/common/db_utils.py

Lines changed: 47 additions & 0 deletions

@@ -74,6 +74,29 @@ def get_gtfs_feeds_query(
     return feed_query
 
 
+def get_all_gtfs_feeds_query(
+    include_wip: bool = False,
+    db_session: Session = None,
+) -> Query[any]:
+    """Get the DB query to use to retrieve all the GTFS feeds, filtering out the WIP feeds if needed"""
+
+    feed_query = db_session.query(Gtfsfeed)
+
+    if not include_wip:
+        feed_query = feed_query.filter(
+            or_(Gtfsfeed.operational_status == None, Gtfsfeed.operational_status != "wip")  # noqa: E711
+        )
+
+    feed_query = feed_query.options(
+        joinedload(Gtfsfeed.gtfsdatasets)
+        .joinedload(Gtfsdataset.validation_reports)
+        .joinedload(Validationreport.notices),
+        *get_joinedload_options(),
+    ).order_by(Gtfsfeed.stable_id)
+
+    return feed_query
+
+
 def get_gtfs_rt_feeds_query(
     limit: int | None,
     offset: int | None,
@@ -137,6 +160,30 @@ def get_gtfs_rt_feeds_query(
     return feed_query
 
 
+def get_all_gtfs_rt_feeds_query(
+    include_wip: bool = False,
+    db_session: Session = None,
+) -> Query:
+    """Get the DB query to use to retrieve all the GTFS rt feeds, filtering out the WIP feeds if needed"""
+    feed_query = db_session.query(Gtfsrealtimefeed)
+
+    if not include_wip:
+        feed_query = feed_query.filter(
+            or_(
+                Gtfsrealtimefeed.operational_status == None,  # noqa: E711
+                Gtfsrealtimefeed.operational_status != "wip",
+            )
+        )
+
+    feed_query = feed_query.options(
+        joinedload(Gtfsrealtimefeed.entitytypes),
+        joinedload(Gtfsrealtimefeed.gtfs_feeds),
+        *get_joinedload_options(),
+    ).order_by(Gtfsfeed.stable_id)
+
+    return feed_query
+
+
 def apply_bounding_filtering(
     query: Query,
     bounding_latitudes: str,
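
The two new helpers mirror the existing get_gtfs_feeds_query / get_gtfs_rt_feeds_query but drop the pagination and filter parameters. A minimal usage sketch, not part of the commit, reusing the Database session helper that export_csv's main.py (shown further below) already relies on:

    # Hypothetical caller: fetch every non-WIP feed with eager-loaded
    # relations, ordered by stable_id, exactly as the new helpers build it.
    import os

    from shared.common.db_utils import get_all_gtfs_feeds_query, get_all_gtfs_rt_feeds_query
    from shared.helpers.database import Database

    db = Database(database_url=os.getenv("FEEDS_DATABASE_URL"))
    with db.start_db_session() as session:
        gtfs_feeds = get_all_gtfs_feeds_query(include_wip=False, db_session=session).all()
        gtfs_rt_feeds = get_all_gtfs_rt_feeds_query(include_wip=False, db_session=session).all()
        print(f"Retrieved {len(gtfs_feeds)} GTFS and {len(gtfs_rt_feeds)} GTFS-RT feeds.")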

functions-python/export_csv/function_config.json

Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@
   "description": "Export the DB feed data as a csv file",
   "entry_point": "export_csv",
   "timeout": 20,
-  "memory": "256Mi",
+  "memory": "1Gi",
   "trigger_http": true,
   "include_folders": ["helpers", "dataset_service"],
   "include_api_folders": ["utils", "database", "feed_filters", "common", "database_gen"],

functions-python/export_csv/requirements.txt

Lines changed: 3 additions & 1 deletion

@@ -9,9 +9,10 @@ requests~=2.32.3
 attrs~=23.1.0
 pluggy~=1.3.0
 certifi~=2024.7.4
-pandas
+pandas~=2.2.3
 python-dotenv==1.0.0
 fastapi-filter[sqlalchemy]==1.0.0
+packaging~=24.2
 
 # SQL Alchemy and Geo Alchemy
 SQLAlchemy==2.0.23
@@ -22,3 +23,4 @@ shapely
 google-cloud-pubsub
 google-cloud-datastore
 cloudevents~=1.10.1
+
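
The explicit packaging pin lines up with the Version comparison in main.py below. Assuming that Version is packaging.version.Version, this sketch shows why raw validator version strings need the extract_numeric_version cleanup introduced in this commit (the "-SNAPSHOT" suffix is a hypothetical example, not taken from the database):

    from packaging.version import InvalidVersion, Version

    assert Version("6.0.1") > Version("5.0.1")  # plain X.Y.Z strings compare fine
    try:
        Version("6.0.1-SNAPSHOT")  # hypothetical suffixed validator version
    except InvalidVersion:
        # Non PEP 440 suffixes do not parse, so main.py's extract_numeric_version()
        # strips the string down to its leading X.Y.Z before building a Version.
        pass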

functions-python/export_csv/src/main.py

Lines changed: 34 additions & 32 deletions

@@ -14,8 +14,10 @@
 # limitations under the License.
 #
 import argparse
-import pandas as pd
 import os
+import re
+
+import pandas as pd
 
 from dotenv import load_dotenv
 import functions_framework
@@ -27,7 +29,7 @@
 
 from shared.database_gen.sqlacodegen_models import Gtfsfeed, Gtfsrealtimefeed
 from collections import OrderedDict
-from shared.common.db_utils import get_gtfs_feeds_query, get_gtfs_rt_feeds_query
+from shared.common.db_utils import get_all_gtfs_rt_feeds_query, get_all_gtfs_feeds_query
 
 from shared.helpers.database import Database
 
@@ -57,10 +59,13 @@ def finalize_row(self):
         self.rows.append(self.data.copy())
         self.data = OrderedDict()
 
-    def write_csv(self, csv_file_path):
+    def write_csv_to_file(self, csv_file_path):
         df = pd.DataFrame(self.rows, columns=self.headers)
         df.to_csv(csv_file_path, index=False)
 
+    def get_dataframe(self) -> pd:
+        return pd.DataFrame(self.rows, columns=self.headers)
+
 
 @functions_framework.http
 def export_csv(request=None):
@@ -71,21 +76,20 @@ def export_csv(request=None):
     :param request: HTTP request object
     :return: HTTP response object
     """
+    data_collector = collect_data()
+    data_collector.write_csv_to_file(csv_file_path)
+    return f"Export of database feeds to CSV file {csv_file_path}."
+
+
+def collect_data() -> DataCollector:
+    """
+    Collect data from the DB and write the output to a DataCollector.
+    :return: A filled DataCollector
+    """
     db = Database(database_url=os.getenv("FEEDS_DATABASE_URL"))
     try:
         with db.start_db_session() as session:
-            gtfs_feeds_query = get_gtfs_feeds_query(
-                limit=None,
-                offset=0,
-                provider=None,
-                producer_url=None,
-                country_code=None,
-                subdivision_name=None,
-                municipality=None,
-                dataset_latitudes=None,
-                dataset_longitudes=None,
-                bounding_filter_method=None,
-                is_official=None,
+            gtfs_feeds_query = get_all_gtfs_feeds_query(
                 include_wip=False,
                 db_session=session,
             )
@@ -94,16 +98,7 @@ def export_csv(request=None):
 
             print(f"Retrieved {len(gtfs_feeds)} GTFS feeds.")
 
-            gtfs_rt_feeds_query = get_gtfs_rt_feeds_query(
-                limit=None,
-                offset=0,
-                provider=None,
-                producer_url=None,
-                entity_types=None,
-                country_code=None,
-                subdivision_name=None,
-                municipality=None,
-                is_official=None,
+            gtfs_rt_feeds_query = get_all_gtfs_rt_feeds_query(
                 include_wip=False,
                 db_session=session,
             )
@@ -134,11 +129,13 @@ def export_csv(request=None):
     except Exception as error:
         print(f"Error retrieving feeds: {error}")
         raise Exception(f"Error retrieving feeds: {error}")
+    data_collector.write_csv_to_file(csv_file_path)
+    return data_collector
 
-    data_collector.write_csv(csv_file_path)
 
-    print(f"Wrote {len(gtfs_feeds)} feeds to {csv_file_path}.")
-    return f"Wrote {len(gtfs_feeds)} feeds to {csv_file_path}."
+def extract_numeric_version(version):
+    match = re.match(r"(\d+\.\d+\.\d+)", version)
+    return match.group(1) if match else version
 
 
 def get_feed_csv_data(feed: Gtfsfeed):
@@ -162,15 +159,19 @@ def get_feed_csv_data(feed: Gtfsfeed):
         # Keep the report from the more recent validator version
         latest_report = reduce(
             lambda a, b: a
-            if Version(a.validator_version) > Version(b.validator_version)
+            if Version(extract_numeric_version(a.validator_version))
+            > Version(extract_numeric_version(b.validator_version))
             else b,
             latest_dataset.validation_reports,
         )
+
         if latest_report:
             if latest_report.features:
                 features = latest_report.features
                 joined_features = (
-                    "|".join(feature.name for feature in features if feature.name)
+                    "|".join(
+                        sorted(feature.name for feature in features if feature.name)
+                    )
                     if features
                     else ""
                 )
@@ -185,7 +186,7 @@ def get_feed_csv_data(feed: Gtfsfeed):
         maximum_longitude = shape.bounds[2]
 
     data = {
-        "mdb_source_id": feed.stable_id,
+        "id": feed.stable_id,
         "data_type": feed.data_type,
         "entity_type": None,
         "location.country_code": ""
@@ -262,6 +263,7 @@ def get_gtfs_rt_feed_csv_data(feed: Gtfsrealtimefeed):
             for entity_type in feed.entitytypes
             if entity_type and entity_type.name
         ]
+        valid_entity_types = sorted(valid_entity_types)
        entity_types = "|".join(valid_entity_types)
 
     static_references = ""
@@ -274,7 +276,7 @@ def get_gtfs_rt_feed_csv_data(feed: Gtfsrealtimefeed):
         static_references = "|".join(valid_feed_references)
 
     data = {
-        "mdb_source_id": feed.stable_id,
+        "id": feed.stable_id,
         "data_type": feed.data_type,
         "entity_type": entity_types,
         "location.country_code": ""
