Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
81209e9
add license matcher as a task
davidgamez Nov 11, 2025
6e45d1d
add csv columns and improve logging
davidgamez Nov 11, 2025
7068f2f
add unit tests and fix lint
davidgamez Nov 11, 2025
3a23f5e
Add license change table and improve csv output
davidgamez Nov 12, 2025
3f1ddb6
Merge branch 'main' into feat/license_matcher
davidgamez Nov 12, 2025
2535075
Add regional id
davidgamez Nov 12, 2025
c71f7d3
add regional id to function response
davidgamez Nov 12, 2025
942a618
Merge branch 'main' into feat/license_matcher
davidgamez Nov 12, 2025
b8f8bd2
fix header parsing
davidgamez Nov 12, 2025
6361fad
add documentation
davidgamez Nov 12, 2025
edd353d
update documentation
davidgamez Nov 12, 2025
5da85e7
Update functions-python/tasks_executor/src/tasks/licenses/license_mat…
davidgamez Nov 12, 2025
025a1f8
Update liquibase/changes/feat_1433.sql
davidgamez Nov 12, 2025
22051bd
Update docs/LICENSES.md
davidgamez Nov 12, 2025
ecefcc0
Update functions-python/tasks_executor/src/main.py
davidgamez Nov 12, 2025
9dff3ad
Update functions-python/tasks_executor/src/tasks/licenses/license_mat…
davidgamez Nov 12, 2025
c336fc6
Update docs/LICENSES.md
davidgamez Nov 12, 2025
3b83cc5
Update api/tests/utils/test_license_utils.py
davidgamez Nov 12, 2025
3da61fd
Update api/tests/utils/test_license_utils.py
davidgamez Nov 12, 2025
37cae18
fix documentation
davidgamez Nov 12, 2025
bab9ea1
Add test and update documentation
davidgamez Nov 19, 2025
e46cf3c
Merge branch 'main' into feat/license_matcher
davidgamez Nov 19, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 69 additions & 1 deletion api/src/shared/common/db_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import os
import re
from typing import Iterator, List, Dict, Optional

from geoalchemy2 import WKTElement
Expand Down Expand Up @@ -30,7 +31,7 @@
from .entity_type_enum import EntityType
from .error_handling import raise_internal_http_validation_error, invalid_bounding_coordinates, invalid_bounding_method
from .iter_utils import batched
from ..feed_filters.gbfs_feed_filter import GbfsFeedFilter, GbfsVersionFilter
from shared.feed_filters.gbfs_feed_filter import GbfsFeedFilter, GbfsVersionFilter


def get_gtfs_feeds_query(
Expand Down Expand Up @@ -511,3 +512,70 @@ def get_gbfs_feeds_query(
)
)
return query


def normalize_url(url_column) -> str:
    """
    Build a SQL expression that normalizes a URL column for comparison.

    Three ``regexp_replace`` calls are chained, each applied to the result of
    the previous one:
      1. remove a leading ``http://`` or ``https://`` (case-insensitive),
      2. remove a leading ``www.`` (case-insensitive),
      3. remove a single trailing ``/``.

    Args:
        url_column: The SQLAlchemy column (or expression) holding the URL.
    Returns:
        A SQLAlchemy expression that yields the normalized URL when evaluated
        by the database.
        NOTE(review): the ``-> str`` annotation does not describe the returned
        expression object; it is left unchanged for interface stability.
    """
    # Step 1: drop the scheme.
    without_scheme = func.regexp_replace(url_column, r"^https?://", "", "gi")
    # Step 2: drop the "www." prefix exposed once the scheme is gone.
    without_www = func.regexp_replace(without_scheme, r"^www\.", "", "gi")
    # Step 3: drop one trailing slash.
    return func.regexp_replace(without_www, r"/$", "", "g")


def normalize_url_str(url: str | None) -> str:
"""Normalize a license URL for matching.
Steps:
- Trim whitespace and quotes
- Remove BOM characters
- Strip fragments and query parameters
- Remove scheme (http/https) and www prefix
- Lowercase the host
"""
u = (url or "").strip().strip("'\"").replace("\ufeff", "")
u = re.sub(r"#.*$", "", u)
u = re.sub(r"\?.*$", "", u)
u = re.sub(r"^https?://", "", u, flags=re.I)
u = re.sub(r"^www\.", "", u, flags=re.I)
# remove trailing slashes
u = re.sub(r"/+$", "", u)
if "/" in u:
host, rest = u.split("/", 1)
return host.lower() + "/" + rest
return u.lower()


def get_feed_query_by_normalized_url(url: str, db_session: Session) -> Query:
    """
    Get a query to find the feed by normalized URL and exclude deprecated feeds.

    The input is normalized with ``normalize_url_str`` and compared against the
    trimmed, lowercased result of ``normalize_url(Feed.producer_url)``.

    Args:
        url: The URL to normalize and search for.
        db_session: SQLAlchemy session.
    Returns:
        A query over ``Feed`` filtered by the normalized producer URL, with
        feeds whose status is "deprecated" excluded.
    """
    # The column side is lowercased in full, while normalize_url_str only
    # lowercases the host. Lowercase the input as well, otherwise a URL with
    # uppercase characters in its path could never match.
    normalized_input = normalize_url_str(url).lower()
    normalized_column = func.lower(func.trim(normalize_url(Feed.producer_url)))
    return db_session.query(Feed).filter(
        normalized_input == normalized_column,
        Feed.status != "deprecated",
    )


def get_feed_by_normalized_url(url: str, db_session: Session) -> Feed | None:
    """
    Fetch the first non-deprecated feed whose normalized URL matches ``url``.

    Args:
        url: The URL to normalize and search for.
        db_session: SQLAlchemy session.
    Returns:
        The first result of the query built by
        ``get_feed_query_by_normalized_url``, or ``None`` when it has no rows.
    """
    feed_query = get_feed_query_by_normalized_url(url, db_session)
    return feed_query.first()
Loading
Loading