Skip to content

Commit 4820180

Browse files
authored
feat: add license matcher task (#1453)
1 parent bf93aef commit 4820180

File tree

19 files changed

+1348
-40
lines changed

19 files changed

+1348
-40
lines changed

api/src/shared/common/db_utils.py

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import logging
22
import os
3+
import re
34
from typing import Iterator, List, Dict, Optional
45

56
from geoalchemy2 import WKTElement
@@ -30,7 +31,7 @@
3031
from .entity_type_enum import EntityType
3132
from .error_handling import raise_internal_http_validation_error, invalid_bounding_coordinates, invalid_bounding_method
3233
from .iter_utils import batched
33-
from ..feed_filters.gbfs_feed_filter import GbfsFeedFilter, GbfsVersionFilter
34+
from shared.feed_filters.gbfs_feed_filter import GbfsFeedFilter, GbfsVersionFilter
3435

3536

3637
def get_gtfs_feeds_query(
@@ -511,3 +512,70 @@ def get_gbfs_feeds_query(
511512
)
512513
)
513514
return query
515+
516+
517+
def normalize_url(url_column) -> str:
    """
    Build a SQL expression that reduces a URL column to a canonical, comparable form.

    Three ``regexp_replace`` calls are chained, each feeding the next:
      1. drop a leading ``http://`` or ``https://`` (case-insensitive),
      2. drop a leading ``www.`` (case-insensitive),
      3. drop one trailing ``/``.

    Nothing is evaluated in Python; the result is meant to be embedded in a query.
    NOTE(review): the ``-> str`` annotation is kept for interface stability, but the
    value returned is a SQLAlchemy function expression, not a Python string.

    Args:
        url_column: The SQLAlchemy column representing the URL.
    Returns:
        A SQLAlchemy expression that normalizes the URL.
    """
    # Step 1: remove the scheme.
    without_scheme = func.regexp_replace(url_column, r"^https?://", "", "gi")
    # Step 2: remove the "www." prefix that may now be at the start.
    without_www = func.regexp_replace(without_scheme, r"^www\.", "", "gi")
    # Step 3: remove a single trailing slash.
    return func.regexp_replace(without_www, r"/$", "", "g")
537+
538+
539+
def normalize_url_str(url: str | None) -> str:
540+
"""Normalize a license URL for matching.
541+
Steps:
542+
- Trim whitespace and quotes
543+
- Remove BOM characters
544+
- Strip fragments and query parameters
545+
- Remove scheme (http/https) and www prefix
546+
- Lowercase the host
547+
"""
548+
u = (url or "").strip().strip("'\"").replace("\ufeff", "")
549+
u = re.sub(r"#.*$", "", u)
550+
u = re.sub(r"\?.*$", "", u)
551+
u = re.sub(r"^https?://", "", u, flags=re.I)
552+
u = re.sub(r"^www\.", "", u, flags=re.I)
553+
# remove trailing slashes
554+
u = re.sub(r"/+$", "", u)
555+
if "/" in u:
556+
host, rest = u.split("/", 1)
557+
return host.lower() + "/" + rest
558+
return u.lower()
559+
560+
561+
def get_feed_query_by_normalized_url(url: str, db_session: Session) -> Query:
    """
    Get a query to find the feed by normalized URL and exclude deprecated feeds.

    The comparison is case-insensitive: both the supplied URL and the stored
    ``Feed.producer_url`` are normalized and fully lowercased before being compared.

    Args:
        url: The URL to normalize and search for.
        db_session: SQLAlchemy session.
    Returns:
        A query over ``Feed`` filtered on the normalized URL and on status != "deprecated".
    """
    # normalize_url_str lowercases only the host, while the database side below is
    # lowercased in full. Lowercase the entire search value so a URL with upper-case
    # path characters can still match its own stored producer_url.
    normalized_search_url = normalize_url_str(url).lower()

    # NOTE(review): normalize_url_str also strips query strings and fragments, whereas
    # normalize_url does not; a stored producer_url containing "?" or "#" will not
    # match. Confirm whether that asymmetry is intended.
    normalized_producer_url = func.lower(func.trim(normalize_url(Feed.producer_url)))

    return db_session.query(Feed).filter(
        normalized_producer_url == normalized_search_url,
        Feed.status != "deprecated",
    )
572+
573+
574+
def get_feed_by_normalized_url(url: str, db_session: Session) -> Feed | None:
    """
    Fetch the first feed returned by the normalized-URL query.

    Delegates the filtering to ``get_feed_query_by_normalized_url`` and takes the
    first row of that query.

    Args:
        url: The URL to normalize and search for.
        db_session: SQLAlchemy session.
    Returns:
        The result of ``.first()`` on the query: a ``Feed``, or ``None`` if there is no row.
    """
    matching_feeds = get_feed_query_by_normalized_url(url, db_session)
    return matching_feeds.first()

0 commit comments

Comments
 (0)