diff --git a/src/models/europython.py b/src/models/europython.py index 689200b..c6b381d 100644 --- a/src/models/europython.py +++ b/src/models/europython.py @@ -241,39 +241,59 @@ def _clean_social_input(text: str) -> str | None: removes "http://" or "https://", removes "www." prefix, removes "@" prefix, + removes invisible Unicode control characters, and decodes URL-encoded characters. """ if EuroPythonSpeaker._is_blank_or_na(text): print(f"Blank or N/A input: {text}") return None + # Strip leading/trailing whitespace text = text.strip() - # Handle inputs like "LinkedIn: https://linkedin.com/in/username" - # or "GH: https://github.com/username" + # Remove any text prefix like "LinkedIn: " or "GH: " text = text.split(" ", 1)[1] if ": " in text else text + # Remove query strings and trailing commas or slashes text = text.split("?", 1)[0] text = text.split(",", 1)[0] text = text.rstrip("/") + # Remove URL schemes if text.startswith("https://"): text = text[8:] elif text.startswith("http://"): text = text[7:] + # Remove "www." prefix if text.startswith("www."): text = text[4:] - # Remove @ if present + # Remove leading @ if text.startswith("@"): text = text[1:] - # Percent-encode non-ASCII characters + # Remove invisible Unicode control characters (Bidi, LTR/RTL marks, etc.) + invisible_chars = [ + "\u200e", + "\u200f", # LTR / RTL marks + "\u202a", + "\u202b", + "\u202c", + "\u202d", + "\u202e", # Directional overrides + "\u2066", + "\u2067", + "\u2068", + "\u2069", # Isolates + ] + text = re.sub(f"[{''.join(invisible_chars)}]", "", text) + + # Percent-encode if needed (e.g., non-ASCII chars) if not text.isascii(): text = quote(text, safe="@/-_.+~#=:") - return text.lower() + return text.lower() if text else None class EuroPythonSession(BaseModel): @@ -292,7 +312,7 @@ class EuroPythonSession(BaseModel): duration: str = "" level: str = "" delivery: str = "" - resources: list[dict[str, str]] | None = None + resources: list[dict[str, str | None]] | None = None room: str | None = None start: datetime | None = None end: datetime | None = None diff --git a/src/models/pretalx.py b/src/models/pretalx.py index 560db85..f5a56c3 100644 --- a/src/models/pretalx.py +++ b/src/models/pretalx.py @@ -63,7 +63,7 @@ class PretalxSubmission(BaseModel): state: SubmissionState abstract: str = "" duration: str = "" - resources: list[dict[str, str]] | None = None + resources: list[dict[str, str | None]] | None = None answers: list[PretalxAnswer] slots: list[PretalxSlot] = Field(default_factory=list, exclude=True) slot_count: int = Field(..., exclude=True) diff --git a/tests/test_extract_socials.py b/tests/test_extract_socials.py index f273182..ed6d8f5 100644 --- a/tests/test_extract_socials.py +++ b/tests/test_extract_socials.py @@ -65,6 +65,7 @@ def test_extract_linkedin_url(input_string, result): @pytest.mark.parametrize( ("input_string", "result"), [ + ("@user.dev", "https://bsky.app/profile/user.dev"), ("user123", "https://bsky.app/profile/user123.bsky.social"), ("@user123", "https://bsky.app/profile/user123.bsky.social"), ("user123.bsky.social", "https://bsky.app/profile/user123.bsky.social"),