Skip to content

Commit 1e22f75

Browse files
authored
feat: infra TL scraping (#796)
1 parent d5c46c2 commit 1e22f75

File tree

14 files changed

+177
-55
lines changed

14 files changed

+177
-55
lines changed

.github/workflows/api-deployer.yml

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ on:
1212
OAUTH2_CLIENT_SECRET:
1313
description: OAuth client secret part of the Identity Aware Proxy configuration
1414
required: true
15+
OP_SERVICE_ACCOUNT_TOKEN:
16+
description: 1Password service account token
17+
required: true
1518
inputs:
1619
ENVIRONMENT:
1720
description: API environment. Possible values prod, staging and dev
@@ -280,10 +283,18 @@ jobs:
280283
echo "GLOBAL_RATE_LIMIT_REQ_PER_MINUTE=${{ inputs.GLOBAL_RATE_LIMIT_REQ_PER_MINUTE }}" >> $GITHUB_ENV
281284
echo "VALIDATOR_ENDPOINT=${{ inputs.VALIDATOR_ENDPOINT }}" >> $GITHUB_ENV
282285
286+
- name: Load secret from 1Password
287+
uses: 1password/load-secrets-action@v2
288+
with:
289+
export-env: true
290+
env:
291+
OP_SERVICE_ACCOUNT_TOKEN: ${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}
292+
TRANSITLAND_API_KEY: "op://rbiv7rvkkrsdlpcrz3bmv7nmcu/TansitLand API Key/credential"
293+
283294
- name: Populate Variables
284295
run: |
285296
scripts/replace-variables.sh -in_file infra/backend.conf.rename_me -out_file infra/backend.conf -variables BUCKET_NAME,OBJECT_PREFIX
286-
scripts/replace-variables.sh -in_file infra/vars.tfvars.rename_me -out_file infra/vars.tfvars -variables PROJECT_ID,REGION,ENVIRONMENT,DEPLOYER_SERVICE_ACCOUNT,FEED_API_IMAGE_VERSION,OAUTH2_CLIENT_ID,OAUTH2_CLIENT_SECRET,GLOBAL_RATE_LIMIT_REQ_PER_MINUTE,ARTIFACT_REPO_NAME,VALIDATOR_ENDPOINT
297+
scripts/replace-variables.sh -in_file infra/vars.tfvars.rename_me -out_file infra/vars.tfvars -variables PROJECT_ID,REGION,ENVIRONMENT,DEPLOYER_SERVICE_ACCOUNT,FEED_API_IMAGE_VERSION,OAUTH2_CLIENT_ID,OAUTH2_CLIENT_SECRET,GLOBAL_RATE_LIMIT_REQ_PER_MINUTE,ARTIFACT_REPO_NAME,VALIDATOR_ENDPOINT,TRANSITLAND_API_KEY
287298
288299
- uses: hashicorp/setup-terraform@v3
289300
with:

.github/workflows/api-dev.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ jobs:
2626
GCP_MOBILITY_FEEDS_SA_KEY: ${{ secrets.DEV_GCP_MOBILITY_FEEDS_SA_KEY }}
2727
OAUTH2_CLIENT_ID: ${{ secrets.DEV_MOBILITY_FEEDS_OAUTH2_CLIENT_ID}}
2828
OAUTH2_CLIENT_SECRET: ${{ secrets.DEV_MOBILITY_FEEDS_OAUTH2_CLIENT_SECRET}}
29+
OP_SERVICE_ACCOUNT_TOKEN: ${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}
2930

3031
integration-tests:
3132
if: ${{ github.event.inputs.run_integration_tests == 'true' }}

.github/workflows/api-prod.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ jobs:
2222
GCP_MOBILITY_FEEDS_SA_KEY: ${{ secrets.PROD_GCP_MOBILITY_FEEDS_SA_KEY }}
2323
OAUTH2_CLIENT_ID: ${{ secrets.PROD_MOBILITY_FEEDS_OAUTH2_CLIENT_ID}}
2424
OAUTH2_CLIENT_SECRET: ${{ secrets.PROD_MOBILITY_FEEDS_OAUTH2_CLIENT_SECRET}}
25+
OP_SERVICE_ACCOUNT_TOKEN: ${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}

.github/workflows/api-qa.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,4 @@ jobs:
2222
GCP_MOBILITY_FEEDS_SA_KEY: ${{ secrets.QA_GCP_MOBILITY_FEEDS_SA_KEY }}
2323
OAUTH2_CLIENT_ID: ${{ secrets.DEV_MOBILITY_FEEDS_OAUTH2_CLIENT_ID}}
2424
OAUTH2_CLIENT_SECRET: ${{ secrets.DEV_MOBILITY_FEEDS_OAUTH2_CLIENT_SECRET}}
25-
25+
OP_SERVICE_ACCOUNT_TOKEN: ${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}

functions-python/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ The function configuration file contains the following properties:
3131
- `max_instance_count`: The maximum number of function instances that can be created in response to a load.
3232
- `min_instance_count`: The minimum number of function instances that can be created in response to a load.
3333
- `available_cpu_count`: The number of CPU cores that are available to the function.
34+
- `available_memory`: The amount of memory available to the function.
3435

3536
# Local Setup
3637

functions-python/feed_sync_dispatcher_transitland/function_config.json

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"name": "feed-sync-dispatcher-transitland",
33
"description": "Feed Sync Dispatcher for Transitland",
44
"entry_point": "feed_sync_dispatcher_transitland",
5-
"timeout": 540,
5+
"timeout": 3600,
66
"memory": "512Mi",
77
"trigger_http": true,
88
"include_folders": ["database_gen", "helpers"],
@@ -11,9 +11,10 @@
1111
"key": "FEEDS_DATABASE_URL"
1212
}
1313
],
14-
"ingress_settings": "ALLOW_INTERNAL_AND_GCLB",
15-
"max_instance_request_concurrency": 20,
16-
"max_instance_count": 10,
14+
"ingress_settings": "ALLOW_ALL",
15+
"max_instance_request_concurrency": 1,
16+
"max_instance_count": 1,
1717
"min_instance_count": 0,
18-
"available_cpu": 1
18+
"available_cpu": 1,
19+
"available_memory": "512Mi"
1920
}

functions-python/feed_sync_dispatcher_transitland/src/main.py

Lines changed: 63 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -15,35 +15,34 @@
1515
#
1616

1717
import json
18-
import os
1918
import logging
20-
import time
19+
import os
2120
import random
21+
import time
2222
from dataclasses import dataclass, asdict
2323
from typing import Optional, List
24-
import requests
25-
from requests.exceptions import RequestException, HTTPError
26-
import pandas as pd
2724

2825
import functions_framework
26+
import pandas as pd
27+
import requests
2928
from google.cloud.pubsub_v1.futures import Future
29+
from requests.exceptions import RequestException, HTTPError
3030
from sqlalchemy.orm import Session
31-
from sqlalchemy import text
3231

32+
from database_gen.sqlacodegen_models import Gtfsfeed
3333
from helpers.feed_sync.feed_sync_common import FeedSyncProcessor, FeedSyncPayload
3434
from helpers.feed_sync.feed_sync_dispatcher import feed_sync_dispatcher
35+
from helpers.logger import Logger
3536
from helpers.pub_sub import get_pubsub_client, get_execution_id
3637

3738
# Logging configuration
38-
logging.basicConfig(
39-
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
40-
)
39+
logging.basicConfig(level=logging.INFO)
4140

4241
# Environment variables
4342
PUBSUB_TOPIC_NAME = os.getenv("PUBSUB_TOPIC_NAME")
4443
PROJECT_ID = os.getenv("PROJECT_ID")
4544
FEEDS_DATABASE_URL = os.getenv("FEEDS_DATABASE_URL")
46-
apikey = os.getenv("TRANSITLAND_API_KEY")
45+
TRANSITLAND_API_KEY = os.getenv("TRANSITLAND_API_KEY")
4746
TRANSITLAND_OPERATOR_URL = os.getenv("TRANSITLAND_OPERATOR_URL")
4847
TRANSITLAND_FEED_URL = os.getenv("TRANSITLAND_FEED_URL")
4948
spec = ["gtfs", "gtfs-rt"]
@@ -83,11 +82,16 @@ def to_json(self):
8382
class TransitFeedSyncProcessor(FeedSyncProcessor):
8483
def check_url_status(self, url: str) -> bool:
8584
"""
86-
Checks if a URL returns a valid response (not 404 or 500).
85+
Checks if a URL returns a valid response status code.
8786
"""
8887
try:
88+
logging.info(f"Checking URL: {url}")
89+
if url is None or len(url) == 0:
90+
logging.warning("URL is empty. Skipping check.")
91+
return False
8992
response = requests.head(url, timeout=25)
90-
return response.status_code not in {404, 500}
93+
logging.info(f"URL status code: {response.status_code}")
94+
return response.status_code < 400
9195
except requests.RequestException as e:
9296
logging.warning(f"Failed to reach {url}: {e}")
9397
return False
@@ -99,9 +103,17 @@ def process_sync(
99103
Process data synchronously to fetch, extract, combine, filter and prepare payloads for publishing
100104
to a queue based on conditions related to the data retrieved from TransitLand API.
101105
"""
102-
feeds_data = self.get_data(TRANSITLAND_FEED_URL, apikey, spec, session)
106+
feeds_data = self.get_data(
107+
TRANSITLAND_FEED_URL, TRANSITLAND_API_KEY, spec, session
108+
)
109+
logging.info("Fetched %s feeds from TransitLand API", len(feeds_data["feeds"]))
110+
103111
operators_data = self.get_data(
104-
TRANSITLAND_OPERATOR_URL, apikey, session=session
112+
TRANSITLAND_OPERATOR_URL, TRANSITLAND_API_KEY, session=session
113+
)
114+
logging.info(
115+
"Fetched %s operators from TransitLand API",
116+
len(operators_data["operators"]),
105117
)
106118

107119
feeds = self.extract_feeds_data(feeds_data)
@@ -151,12 +163,25 @@ def process_sync(
151163
.str.lower()
152164
.isin([c.lower() for c in countries_not_included])
153165
]
166+
logging.info(
167+
"Filtered out %s feeds from countries: %s",
168+
len(df_grouped) - len(filtered_df),
169+
countries_not_included,
170+
)
154171

155172
# Filtered out URLs that return undesired status codes
173+
filtered_df = filtered_df.drop_duplicates(
174+
subset=["feed_url"]
175+
) # Drop duplicates
156176
filtered_df = filtered_df[filtered_df["feed_url"].apply(self.check_url_status)]
177+
logging.info(
178+
"Filtered out %s feeds with invalid URLs",
179+
len(df_grouped) - len(filtered_df),
180+
)
157181

158182
# Convert filtered DataFrame to dictionary format
159183
combined_data = filtered_df.to_dict(orient="records")
184+
logging.info("Prepared %s feeds for publishing", len(combined_data))
160185

161186
payloads = []
162187
for data in combined_data:
@@ -197,7 +222,7 @@ def process_sync(
197222
def get_data(
198223
self,
199224
url,
200-
apikey,
225+
api_key,
201226
spec=None,
202227
session=None,
203228
max_retries=3,
@@ -209,11 +234,13 @@ def get_data(
209234
Handles rate limits, retries, and error cases.
210235
Returns the parsed data as a dictionary containing feeds and operators.
211236
"""
212-
headers = {"apikey": apikey}
237+
headers = {"apikey": api_key}
213238
params = {"spec": spec} if spec else {}
214239
all_data = {"feeds": [], "operators": []}
215240
delay = initial_delay
241+
response = None
216242

243+
logging.info("Fetching data from %s", url)
217244
while url:
218245
for attempt in range(max_retries):
219246
try:
@@ -225,12 +252,17 @@ def get_data(
225252
all_data["feeds"].extend(data.get("feeds", []))
226253
all_data["operators"].extend(data.get("operators", []))
227254
url = data.get("meta", {}).get("next")
255+
logging.info(
256+
"Fetched %s feeds and %s operators",
257+
len(all_data["feeds"]),
258+
len(all_data["operators"]),
259+
)
260+
logging.info("Next URL: %s", url)
228261
delay = initial_delay
229262
break
230-
231263
except (RequestException, HTTPError) as e:
232264
logging.error("Attempt %s failed: %s", attempt + 1, e)
233-
if response.status_code == 429:
265+
if response is not None and response.status_code == 429:
234266
logging.warning("Rate limit hit. Waiting for %s seconds", delay)
235267
time.sleep(delay + random.uniform(0, 1))
236268
delay = min(delay * 2, max_delay)
@@ -240,7 +272,9 @@ def get_data(
240272
)
241273
return all_data
242274
else:
275+
logging.info("Retrying in %s seconds", delay)
243276
time.sleep(delay)
277+
logging.info("Finished fetching data.")
244278
return all_data
245279

246280
def extract_feeds_data(self, feeds_data: dict) -> List[dict]:
@@ -297,13 +331,12 @@ def check_external_id(
297331
:param source: The source to filter by (e.g., 'TLD' for TransitLand)
298332
:return: True if the feed exists, False otherwise
299333
"""
300-
query = text(
301-
"SELECT 1 FROM public.externalid WHERE associated_id = :external_id AND source = :source LIMIT 1"
334+
results = (
335+
db_session.query(Gtfsfeed)
336+
.filter(Gtfsfeed.externalids.any(associated_id=external_id))
337+
.all()
302338
)
303-
result = db_session.execute(
304-
query, {"external_id": external_id, "source": source}
305-
).fetchone()
306-
return result is not None
339+
return results is not None and len(results) > 0
307340

308341
def get_mbd_feed_url(
309342
self, db_session: Session, external_id: str, source: str
@@ -315,19 +348,12 @@ def get_mbd_feed_url(
315348
:param source: The source to filter by (e.g., 'TLD' for TransitLand)
316349
:return: feed_url in mbd if exists, otherwise None
317350
"""
318-
query = text(
319-
"""
320-
SELECT f.producer_url
321-
FROM public.feed f
322-
JOIN public.externalid e ON f.id = e.feed_id
323-
WHERE e.associated_id = :external_id AND e.source = :source
324-
LIMIT 1
325-
"""
351+
results = (
352+
db_session.query(Gtfsfeed)
353+
.filter(Gtfsfeed.externalids.any(associated_id=external_id))
354+
.all()
326355
)
327-
result = db_session.execute(
328-
query, {"external_id": external_id, "source": source}
329-
).fetchone()
330-
return result[0] if result else None
356+
return results[0].producer_url if results else None
331357

332358
def publish_callback(
333359
self, future: Future, payload: FeedSyncPayload, topic_path: str
@@ -350,6 +376,7 @@ def feed_sync_dispatcher_transitland(request):
350376
"""
351377
HTTP Function entry point queries the transitland API and publishes events to a Pub/Sub topic to be processed.
352378
"""
379+
Logger.init_logger()
353380
publisher = get_pubsub_client()
354381
topic_path = publisher.topic_path(PROJECT_ID, PUBSUB_TOPIC_NAME)
355382
transit_land_feed_sync_processor = TransitFeedSyncProcessor()

functions-python/feed_sync_dispatcher_transitland/tests/test_feed_sync.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
from unittest.mock import Mock, patch, call
33
from requests import Session as RequestsSession
44
from sqlalchemy.orm import Session as DBSession
5+
6+
from database_gen.sqlacodegen_models import Gtfsfeed
57
from feed_sync_dispatcher_transitland.src.main import (
68
TransitFeedSyncProcessor,
79
FeedSyncPayload,
@@ -90,24 +92,24 @@ def test_extract_operators_data(processor):
9092

9193
def test_check_external_id(processor):
9294
mock_db_session = Mock(spec=DBSession)
93-
mock_db_session.execute.return_value.fetchone.return_value = (1,)
95+
mock_db_session.query.return_value.filter.return_value.all.return_value = (1,)
9496
result = processor.check_external_id(mock_db_session, "onestop1", "TLD")
9597
assert result is True
9698

97-
mock_db_session.execute.return_value.fetchone.return_value = None
99+
mock_db_session.query.return_value.filter.return_value.all.return_value = None
98100
result = processor.check_external_id(mock_db_session, "onestop2", "TLD")
99101
assert result is False
100102

101103

102104
def test_get_mbd_feed_url(processor):
103105
mock_db_session = Mock(spec=DBSession)
104-
mock_db_session.execute.return_value.fetchone.return_value = (
105-
"http://example.com/feed1",
106-
)
106+
mock_db_session.query.return_value.filter.return_value.all.return_value = [
107+
Gtfsfeed(producer_url="http://example.com/feed1")
108+
]
107109
result = processor.get_mbd_feed_url(mock_db_session, "onestop1", "TLD")
108110
assert result == "http://example.com/feed1"
109111

110-
mock_db_session.execute.return_value.fetchone.return_value = None
112+
mock_db_session.query.return_value.filter.return_value.all.return_value = None
111113
result = processor.get_mbd_feed_url(mock_db_session, "onestop2", "TLD")
112114
assert result is None
113115

@@ -343,7 +345,7 @@ def test_get_data_retries(processor):
343345
with patch("time.sleep", return_value=None) as mock_sleep:
344346
result = processor.get_data(
345347
url="http://example.com",
346-
apikey="dummy_api_key",
348+
api_key="dummy_api_key",
347349
session=mock_session,
348350
max_retries=3,
349351
initial_delay=1,

functions-python/helpers/feed_sync/feed_sync_dispatcher.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def feed_sync_dispatcher(
3535
"""
3636
publisher = get_pubsub_client()
3737
try:
38-
session = start_db_session(os.getenv("FEEDS_DATABASE_URL"))
38+
session = start_db_session(os.getenv("FEEDS_DATABASE_URL"), echo=False)
3939
payloads = feed_sync_processor.process_sync(session, execution_id)
4040
except Exception as error:
4141
logging.error(f"Error processing feeds sync: {error}")
@@ -47,7 +47,7 @@ def feed_sync_dispatcher(
4747

4848
for payload in payloads:
4949
data_str = json.dumps(payload.payload.__dict__)
50-
print(f"Publishing {data_str} to {pubsub_topic_path}.")
50+
logging.info(f"Publishing {data_str} to {pubsub_topic_path}.")
5151
future = publish(publisher, pubsub_topic_path, data_str.encode("utf-8"))
5252
future.add_done_callback(
5353
lambda _: feed_sync_processor.publish_callback(

0 commit comments

Comments
 (0)