|
1 | 1 | import logging |
2 | 2 | import os |
| 3 | +import re |
3 | 4 | from typing import Iterator, List, Dict, Optional |
4 | 5 |
|
5 | 6 | from geoalchemy2 import WKTElement |
|
30 | 31 | from .entity_type_enum import EntityType |
31 | 32 | from .error_handling import raise_internal_http_validation_error, invalid_bounding_coordinates, invalid_bounding_method |
32 | 33 | from .iter_utils import batched |
33 | | -from ..feed_filters.gbfs_feed_filter import GbfsFeedFilter, GbfsVersionFilter |
| 34 | +from shared.feed_filters.gbfs_feed_filter import GbfsFeedFilter, GbfsVersionFilter |
34 | 35 |
|
35 | 36 |
|
36 | 37 | def get_gtfs_feeds_query( |
@@ -511,3 +512,70 @@ def get_gbfs_feeds_query( |
511 | 512 | ) |
512 | 513 | ) |
513 | 514 | return query |
| 515 | + |
| 516 | + |
def normalize_url(url_column) -> str:
    """
    Build a SQL expression that normalizes a URL column for comparison.
    The expression chains three ``regexp_replace`` calls which, in order, drop a leading
    ``http://`` or ``https://``, drop a leading ``www.`` and drop one trailing slash.
    Nothing is evaluated in Python: despite the ``str`` annotation, the returned value is
    the expression object produced by ``func.regexp_replace``, meant to be embedded in a query.
    Args:
        url_column: The SQLAlchemy column (or expression) holding the URL.
    Returns:
        A SQLAlchemy expression yielding the normalized URL.
    """
    # Each step wraps the previous one, so the database evaluates them inside-out.
    without_scheme = func.regexp_replace(url_column, r"^https?://", "", "gi")
    without_www = func.regexp_replace(without_scheme, r"^www\.", "", "gi")
    return func.regexp_replace(without_www, r"/$", "", "g")
| 537 | + |
| 538 | + |
| 539 | +def normalize_url_str(url: str | None) -> str: |
| 540 | + """Normalize a license URL for matching. |
| 541 | + Steps: |
| 542 | + - Trim whitespace and quotes |
| 543 | + - Remove BOM characters |
| 544 | + - Strip fragments and query parameters |
| 545 | + - Remove scheme (http/https) and www prefix |
| 546 | + - Lowercase the host |
| 547 | + """ |
| 548 | + u = (url or "").strip().strip("'\"").replace("\ufeff", "") |
| 549 | + u = re.sub(r"#.*$", "", u) |
| 550 | + u = re.sub(r"\?.*$", "", u) |
| 551 | + u = re.sub(r"^https?://", "", u, flags=re.I) |
| 552 | + u = re.sub(r"^www\.", "", u, flags=re.I) |
| 553 | + # remove trailing slashes |
| 554 | + u = re.sub(r"/+$", "", u) |
| 555 | + if "/" in u: |
| 556 | + host, rest = u.split("/", 1) |
| 557 | + return host.lower() + "/" + rest |
| 558 | + return u.lower() |
| 559 | + |
| 560 | + |
def get_feed_query_by_normalized_url(url: str, db_session: Session) -> Query:
    """
    Get a query to find the feed by normalized URL and exclude deprecated feeds.
    The match is case-insensitive: the stored ``producer_url`` is normalized, trimmed and
    lower-cased in SQL, and the input URL is normalized and lower-cased in Python.
    Args:
        url: The URL to normalize and search for.
        db_session: SQLAlchemy session.
    Returns:
        A query over ``Feed`` filtered by the normalized URL and by ``status != "deprecated"``.
    """
    # normalize_url_str() lower-cases only the host, while the column side is
    # lower-cased in full; fold the whole input so that URLs with upper-case
    # characters in the path can still match.
    normalized_input = normalize_url_str(url).lower()
    # NOTE(review): normalize_url_str() also strips query strings and fragments,
    # which the SQL normalization does not - stored URLs containing them will
    # not match. Confirm whether that is intended.
    normalized_column = func.lower(func.trim(normalize_url(Feed.producer_url)))
    return db_session.query(Feed).filter(
        normalized_column == normalized_input,
        Feed.status != "deprecated",
    )
| 572 | + |
| 573 | + |
def get_feed_by_normalized_url(url: str, db_session: Session) -> Feed | None:
    """
    Fetch the first feed returned by the normalized-URL query, which excludes deprecated feeds.
    Args:
        url: The URL to normalize and search for.
        db_session: SQLAlchemy session.
    Returns:
        The result of ``.first()`` on the query built by ``get_feed_query_by_normalized_url``.
    """
    feed_query = get_feed_query_by_normalized_url(url, db_session)
    return feed_query.first()
0 commit comments