11from __future__ import annotations
22
3+ import re
34from datetime import date , datetime
5+ from urllib .parse import quote
46
57from pydantic import BaseModel , Field , computed_field , field_validator , model_validator
68
@@ -29,7 +31,7 @@ class EuroPythonSpeaker(BaseModel):
2931 mastodon_url : str | None = None
3032 linkedin_url : str | None = None
3133 bluesky_url : str | None = None
32- gitx : str | None = None
34+ gitx_url : str | None = None
3335
3436 @computed_field
3537 def website_url (self ) -> str :
@@ -50,93 +52,210 @@ def extract_answers(cls, values) -> dict:
5052 values ["homepage" ] = answer .answer_text
5153
5254 if answer .question_text == SpeakerQuestion .twitter :
53- values ["twitter_url" ] = cls .extract_twitter_url (
54- answer .answer_text .strip ().split ()[0 ]
55- )
55+ values ["twitter_url" ] = cls .extract_twitter_url (answer .answer_text )
5656
5757 if answer .question_text == SpeakerQuestion .mastodon :
58- values ["mastodon_url" ] = cls .extract_mastodon_url (
59- answer .answer_text .strip ().split ()[0 ]
60- )
58+ values ["mastodon_url" ] = cls .extract_mastodon_url (answer .answer_text )
6159
6260 if answer .question_text == SpeakerQuestion .bluesky :
63- values ["bluesky_url" ] = cls .extract_bluesky_url (
64- answer .answer_text .strip ().split ()[0 ]
65- )
61+ values ["bluesky_url" ] = cls .extract_bluesky_url (answer .answer_text )
6662
6763 if answer .question_text == SpeakerQuestion .linkedin :
68- values ["linkedin_url" ] = cls .extract_linkedin_url (
69- answer .answer_text .strip ().split ()[0 ]
70- )
64+ values ["linkedin_url" ] = cls .extract_linkedin_url (answer .answer_text )
7165
7266 if answer .question_text == SpeakerQuestion .gitx :
73- values ["gitx " ] = answer .answer_text . strip (). split ()[ 0 ]
67+ values ["gitx_url " ] = cls . extract_gitx_url ( answer .answer_text )
7468
7569 return values
7670
@staticmethod
def extract_twitter_url(text: str) -> str | None:
    """
    Build a canonical ``https://x.com/<username>`` URL from a free-form answer.

    The answer is first normalised by ``EuroPythonSpeaker._clean_social_input``
    and is then accepted in one of these shapes:
    - @username
    - username
    - twitter.com/username
    - x.com/username

    Returns the profile URL, or None (after printing a diagnostic) when the
    answer is blank / "N/A" or is not a 1-15 character word-character handle,
    optionally preceded by one of the two known hosts.
    """
    cleaned = EuroPythonSpeaker._clean_social_input(text)
    if cleaned is None:
        print(f"Invalid Twitter URL: {text}")
        return None

    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "user\n" through into the URL.
    # The host prefix is optional, so a bare username uses the same pattern;
    # the username itself cannot contain "." or "/", so there is no ambiguity.
    match = re.fullmatch(r"(?:(?:twitter|x)\.com/)?(\w{1,15})", cleaned)
    if match:
        return f"https://x.com/{match.group(1)}"

    print(f"Invalid Twitter URL: {cleaned}")
    return None
9298
@staticmethod
def extract_mastodon_url(text: str) -> str | None:
    """
    Build a ``https://<instance>/@<username>`` URL from a free-form answer.

    The answer is first normalised by ``EuroPythonSpeaker._clean_social_input``
    (which, among other things, drops a leading "@"). Supported shapes:
    - @username@instance
    - username@instance
    - instance/@username
    - instance/@username@instance (redirect; the trailing instance is used)

    Returns the profile URL, or None (after printing a diagnostic) when the
    answer is blank / "N/A" or cannot be split into a valid username and
    instance.
    """
    cleaned = EuroPythonSpeaker._clean_social_input(text)
    if not cleaned:
        print(f"Invalid Mastodon URL: {text}")
        return None

    # Characters accepted in both the instance host and the username.
    name_pattern = r"[\w.-]+"

    # instance/@username
    match = re.fullmatch(rf"({name_pattern})/@({name_pattern})", cleaned)
    if match:
        instance, username = match.groups()
        return f"https://{instance}/@{username}"

    parts = cleaned.split("@")
    if len(parts) == 3:  # <prefix>@username@instance, prefix is ignored
        _, username, instance = parts
    elif len(parts) == 2:  # username@instance
        username, instance = parts
    else:
        print(f"Invalid Mastodon URL: {cleaned}")
        return None

    # Validate both halves: a plain split would otherwise turn an input such
    # as "host/@user/extra" into username="host/" and instance="user/extra".
    if re.fullmatch(name_pattern, username) and re.fullmatch(name_pattern, instance):
        return f"https://{instance}/@{username}"

    print(f"Invalid Mastodon URL: {cleaned}")
    return None
116135
@staticmethod
def extract_linkedin_url(text: str) -> str | None:
    """
    Build a LinkedIn personal-profile URL from a free-form answer.

    The answer is first normalised by ``EuroPythonSpeaker._clean_social_input``
    and is then accepted in one of these shapes:
    - username
    - @username
    - in/username
    - linkedin.com/in/username
    - tr.linkedin.com/in/username (country subdomains)

    Returns the ``https://`` URL, or None (after printing a diagnostic) when
    the answer is blank / "N/A" or the result is not a ``/in/<name>`` profile
    on linkedin.com or one of its subdomains.
    """
    cleaned = EuroPythonSpeaker._clean_social_input(text)
    if cleaned is None:
        print(f"Invalid LinkedIn URL: {text}")
        return None

    if cleaned.startswith("in/"):
        linkedin_url = f"https://linkedin.com/{cleaned}"
    elif "." not in cleaned:
        # No dot means no host part, so treat the whole thing as a username.
        linkedin_url = f"https://linkedin.com/in/{cleaned}"
    else:
        linkedin_url = f"https://{cleaned}"

    # fullmatch rather than match + "$", which would accept a trailing newline.
    # The profile name may contain word characters, hyphens and %XX escapes.
    if not re.fullmatch(
        r"https://([\w-]+\.)?linkedin\.com/in/(?:[\w\-]|%[0-9A-Fa-f]{2})+",
        linkedin_url,
    ):
        print(f"Invalid LinkedIn URL: {linkedin_url}")
        return None

    return linkedin_url
132166
@staticmethod
def extract_bluesky_url(text: str) -> str | None:
    """
    Build a ``https://bsky.app/profile/<handle>`` URL from a free-form answer.

    The answer is first normalised by ``EuroPythonSpeaker._clean_social_input``
    and is then accepted in one of these shapes:
    - username
    - @username
    - username.bsky.social
    - username.dev (custom domain handle)
    - bsky.app/profile/username
    - bsky/username

    A handle without a dot gets ``.bsky.social`` appended.

    Returns the profile URL, or None (after printing a diagnostic) when the
    answer is blank / "N/A" or the handle is not a sequence of non-empty,
    dot-separated labels made of word characters and hyphens.
    """
    cleaned = EuroPythonSpeaker._clean_social_input(text)
    if cleaned is None:
        print(f"Invalid Bluesky URL: {text}")
        return None

    for marker in ("bsky.app/profile/", "bsky/"):
        if marker in cleaned:
            handle = cleaned.split(marker, 1)[1]
            break
    else:
        # No known marker: keep whatever follows the last slash.
        handle = cleaned.rsplit("/", 1)[-1]

    if "." not in handle:
        handle += ".bsky.social"

    bluesky_url = f"https://bsky.app/profile/{handle}"

    # Every label must be non-empty. Without this an empty handle becomes
    # ".bsky.social" and would be returned as a profile URL.
    if not re.fullmatch(r"[\w-]+(?:\.[\w-]+)+", handle):
        print(f"Invalid Bluesky URL: {bluesky_url}")
        return None

    return bluesky_url
201+
@staticmethod
def extract_gitx_url(text: str) -> str | None:
    """
    Build a GitHub/GitLab URL from a free-form answer.

    The answer is first normalised by ``EuroPythonSpeaker._clean_social_input``
    and is then accepted in one of these shapes:
    - username (assumed to be a github.com account)
    - @username
    - github.com/username
    - gitlab.com/username

    Returns the ``https://`` URL, or None (after printing a diagnostic) when
    the answer is blank / "N/A" or matches none of the shapes above.
    """
    cleaned = EuroPythonSpeaker._clean_social_input(text)
    if cleaned is None:
        print(f"Invalid GitHub/GitLab URL: {text}")
        return None

    # Known host followed by a non-empty path. Whitespace is rejected so that
    # trailing free text (e.g. "github.com/user (work)") is not emitted as a URL.
    if re.fullmatch(r"(?:github|gitlab)\.com/\S+", cleaned):
        return f"https://{cleaned}"

    # Bare username: assume github.com. fullmatch avoids accepting a trailing
    # newline, which "$" with re.match would allow.
    if re.fullmatch(r"[\w-]+", cleaned):
        return f"https://github.com/{cleaned}"

    print(f"Invalid GitHub/GitLab URL: {cleaned}")
    return None
225+
226+ @staticmethod
227+ def _is_blank_or_na (text : str ) -> bool :
228+ """
229+ Check if the text is blank or (equals "N/A" or "-")
230+ """
231+ return not text or text .strip ().lower () in {"n/a" , "-" }
232+
233+ @staticmethod
234+ def _clean_social_input (text : str ) -> str | None :
235+ """
236+ Cleans the input string for social media URLs.
237+ Returns None if the input is blank or "N/A",
238+ removes prefixes like "LinkedIn: " or "GH: ",
239+ removes parameters like "?something=true",
240+ removes trailing slashes,
241+ removes "http://" or "https://",
242+ removes "www." prefix,
243+ removes "@" prefix,
244+ and decodes URL-encoded characters.
245+ """
246+ if EuroPythonSpeaker ._is_blank_or_na (text ):
247+ print (f"Blank or N/A input: { text } " )
248+ return None
249+
250+ text = text .strip ()
251+
252+ # Handle inputs like "LinkedIn: https://linkedin.com/in/username"
253+ # or "GH: https://github.com/username"
254+ text = text .split (" " , 1 )[1 ] if ": " in text else text
255+
256+ text = text .split ("?" , 1 )[0 ]
257+ text = text .split ("," , 1 )[0 ]
258+ text = text .rstrip ("/" )
140259
141260 if text .startswith ("https://" ):
142261 text = text [8 :]
@@ -150,19 +269,11 @@ def extract_bluesky_url(text: str) -> str:
150269 if text .startswith ("@" ):
151270 text = text [1 :]
152271
153- for marker in ("bsky.app/profile/" , "bsky/" ):
154- if marker in text :
155- text = text .split (marker , 1 )[1 ]
156- break
157- # case custom domain
158- else :
159- text = text .rsplit ("/" , 1 )[- 1 ]
160-
161- # if there's no dot, assume it's a non-custom handle and append '.bsky.social'
162- if "." not in text :
163- text += ".bsky.social"
272+ # Percent-encode non-ASCII characters
273+ if not text .isascii ():
274+ text = quote (text , safe = "@/-_.+~#=:" )
164275
165- return f"https://bsky.app/profile/ { text } "
276+ return text . lower ()
166277
167278
168279class EuroPythonSession (BaseModel ):
0 commit comments