1
1
from __future__ import annotations
2
2
3
+ import re
3
4
from datetime import date , datetime
5
+ from urllib .parse import quote
4
6
5
7
from pydantic import BaseModel , Field , computed_field , field_validator , model_validator
6
8
@@ -29,7 +31,7 @@ class EuroPythonSpeaker(BaseModel):
29
31
mastodon_url : str | None = None
30
32
linkedin_url : str | None = None
31
33
bluesky_url : str | None = None
32
- gitx : str | None = None
34
+ gitx_url : str | None = None
33
35
34
36
@computed_field
35
37
def website_url (self ) -> str :
@@ -50,93 +52,210 @@ def extract_answers(cls, values) -> dict:
50
52
values ["homepage" ] = answer .answer_text
51
53
52
54
if answer .question_text == SpeakerQuestion .twitter :
53
- values ["twitter_url" ] = cls .extract_twitter_url (
54
- answer .answer_text .strip ().split ()[0 ]
55
- )
55
+ values ["twitter_url" ] = cls .extract_twitter_url (answer .answer_text )
56
56
57
57
if answer .question_text == SpeakerQuestion .mastodon :
58
- values ["mastodon_url" ] = cls .extract_mastodon_url (
59
- answer .answer_text .strip ().split ()[0 ]
60
- )
58
+ values ["mastodon_url" ] = cls .extract_mastodon_url (answer .answer_text )
61
59
62
60
if answer .question_text == SpeakerQuestion .bluesky :
63
- values ["bluesky_url" ] = cls .extract_bluesky_url (
64
- answer .answer_text .strip ().split ()[0 ]
65
- )
61
+ values ["bluesky_url" ] = cls .extract_bluesky_url (answer .answer_text )
66
62
67
63
if answer .question_text == SpeakerQuestion .linkedin :
68
- values ["linkedin_url" ] = cls .extract_linkedin_url (
69
- answer .answer_text .strip ().split ()[0 ]
70
- )
64
+ values ["linkedin_url" ] = cls .extract_linkedin_url (answer .answer_text )
71
65
72
66
if answer .question_text == SpeakerQuestion .gitx :
73
- values ["gitx " ] = answer .answer_text . strip (). split ()[ 0 ]
67
+ values ["gitx_url " ] = cls . extract_gitx_url ( answer .answer_text )
74
68
75
69
return values
76
70
77
71
@staticmethod
def extract_twitter_url(text: str) -> str | None:
    """
    Turn a free-form Twitter/X answer into ``https://x.com/<username>``.

    The answer is normalised with ``_clean_social_input`` first. The cleaned
    value is accepted in two shapes:
    - ``twitter.com/<username>`` or ``x.com/<username>``
    - a bare ``<username>`` (a leading "@" is removed by the cleaning step)

    A username is 1 to 15 word characters. Any other input is reported with
    ``print`` and None is returned.
    """
    cleaned = EuroPythonSpeaker._clean_social_input(text)
    if cleaned is None:
        print(f"Invalid Twitter URL: {text}")
        return None

    # Domain-qualified form: group 1 is the host, group 2 the username.
    with_domain = re.match(r"^(twitter\.com|x\.com)/([\w]{1,15})$", cleaned)
    if with_domain is not None:
        username = with_domain.group(2)
        return f"https://x.com/{username}"

    # Otherwise the whole cleaned value must itself be a valid username.
    if re.match(r"^[\w]{1,15}$", cleaned) is None:
        print(f"Invalid Twitter URL: {cleaned}")
        return None

    return f"https://x.com/{cleaned}"
92
98
93
99
@staticmethod
def extract_mastodon_url(text: str) -> str | None:
    """
    Turn a free-form Mastodon answer into ``https://<instance>/@<username>``.

    The answer is normalised with ``_clean_social_input`` first. Accepted
    shapes of the cleaned value:
    - ``instance/@username``
    - ``username@instance``
    - three "@"-separated pieces, e.g. ``instance/@username@instance``; the
      first piece is discarded and the last one is used as the instance

    Any other input is reported with ``print`` and None is returned.
    """
    cleaned = EuroPythonSpeaker._clean_social_input(text)
    if not cleaned:
        print(f"Invalid Mastodon URL: {text}")
        return None

    # Profile-path form: instance/@username
    path_form = re.match(r"^([\w\.-]+)/@([\w\.-]+)$", cleaned)
    if path_form is not None:
        instance, username = path_form.groups()
        return f"https://{instance}/@{username}"

    # Handle form: the last two "@"-separated pieces are username and instance.
    pieces = cleaned.split("@")
    if len(pieces) in (2, 3):
        *_, username, instance = pieces
        if username and instance:
            return f"https://{instance}/@{username}"

    print(f"Invalid Mastodon URL: {cleaned}")
    return None
116
135
117
136
@staticmethod
def extract_linkedin_url(text: str) -> str | None:
    """
    Turn a free-form LinkedIn answer into a personal profile URL.

    The answer is normalised with ``_clean_social_input`` first. Accepted
    shapes of the cleaned value:
    - ``in/username``
    - ``username`` (no dot in it)
    - ``linkedin.com/in/username``, optionally with one subdomain such as
      ``tr.linkedin.com/in/username``

    The resulting URL must match the ``/in/<slug>`` profile pattern, where the
    slug is made of word characters, hyphens and percent-escapes. Otherwise
    the URL is reported with ``print`` and None is returned.
    """
    cleaned = EuroPythonSpeaker._clean_social_input(text)
    if cleaned is None:
        print(f"Invalid LinkedIn URL: {text}")
        return None

    if cleaned.startswith("in/"):
        # Path without host.
        linkedin_url = f"https://linkedin.com/{cleaned}"
    elif "." in cleaned:
        # Contains a dot, so it is taken to carry its own host.
        linkedin_url = f"https://{cleaned}"
    else:
        # Bare profile slug.
        linkedin_url = f"https://linkedin.com/in/{cleaned}"

    is_profile = re.match(
        r"^https://([\w-]+\.)?linkedin\.com/in/(?:[\w\-]|%[0-9A-Fa-f]{2})+$",
        linkedin_url,
    )
    if is_profile is None:
        print(f"Invalid LinkedIn URL: {linkedin_url}")
        return None

    return linkedin_url
132
166
133
167
@staticmethod
def extract_bluesky_url(text: str) -> str | None:
    """
    Turn a free-form Bluesky answer into ``https://bsky.app/profile/<handle>``.

    The answer is normalised with ``_clean_social_input`` first. Accepted
    shapes of the cleaned value:
    - ``username`` (``.bsky.social`` is appended)
    - ``username.bsky.social`` or a custom domain such as ``username.dev``
    - ``bsky.app/profile/<handle>`` or ``bsky/<handle>``

    The handle must consist of at least two non-empty dot-separated labels
    made of word characters and hyphens. Any other input, including one that
    cleans down to an empty string, is reported with ``print`` and None is
    returned.
    """
    cleaned = EuroPythonSpeaker._clean_social_input(text)
    # The cleaning step can return "" rather than None (e.g. for "@" or
    # whitespace-only answers). Reject it here, otherwise ".bsky.social"
    # would be appended to an empty handle.
    if not cleaned:
        print(f"Invalid Bluesky URL: {text}")
        return None

    for marker in ("bsky.app/profile/", "bsky/"):
        if marker in cleaned:
            cleaned = cleaned.split(marker, 1)[1]
            break
    else:
        # No known marker: keep only the last path segment (custom domain case).
        cleaned = cleaned.rsplit("/", 1)[-1]

    # Defensive: nothing left after removing the marker or path.
    if not cleaned:
        print(f"Invalid Bluesky URL: {text}")
        return None

    # Without a dot this is a plain username on the default bsky.social domain.
    if "." not in cleaned:
        cleaned += ".bsky.social"

    bluesky_url = f"https://bsky.app/profile/{cleaned}"

    # Every label must be non-empty, so ".bsky.social" or "a..b" is refused.
    if not re.match(r"^https://bsky\.app/profile/[\w-]+(?:\.[\w-]+)+$", bluesky_url):
        print(f"Invalid Bluesky URL: {bluesky_url}")
        return None

    return bluesky_url
201
+
202
@staticmethod
def extract_gitx_url(text: str) -> str | None:
    """
    Turn a free-form GitHub/GitLab answer into a profile URL.

    The answer is normalised with ``_clean_social_input`` first. Accepted
    shapes of the cleaned value:
    - ``github.com/...`` or ``gitlab.com/...`` (returned with ``https://``)
    - a bare ``username`` of word characters and hyphens, assumed to be a
      GitHub account

    Any other input is reported with ``print`` and None is returned.
    """
    cleaned = EuroPythonSpeaker._clean_social_input(text)
    if cleaned is None:
        print(f"Invalid GitHub/GitLab URL: {text}")
        return None

    names_known_host = cleaned.startswith(("github.com/", "gitlab.com/"))
    if names_known_host:
        gitx_url = f"https://{cleaned}"
    elif re.match(r"^[\w-]+$", cleaned) is not None:
        # Bare username: default to GitHub.
        gitx_url = f"https://github.com/{cleaned}"
    else:
        print(f"Invalid GitHub/GitLab URL: {cleaned}")
        gitx_url = None

    return gitx_url
225
+
226
@staticmethod
def _is_blank_or_na(text: str) -> bool:
    """
    Check whether the text carries no usable answer.

    Returns True when the text is empty (or otherwise falsy), consists only
    of whitespace, or equals "N/A" or "-" once surrounding whitespace is
    removed and case is ignored.
    """
    if not text:
        return True

    # Strip before testing for emptiness so whitespace-only answers count as blank.
    normalized = text.strip().lower()
    return not normalized or normalized in {"n/a", "-"}
232
+
233
+ @staticmethod
234
+ def _clean_social_input (text : str ) -> str | None :
235
+ """
236
+ Cleans the input string for social media URLs.
237
+ Returns None if the input is blank or "N/A",
238
+ removes prefixes like "LinkedIn: " or "GH: ",
239
+ removes parameters like "?something=true",
240
+ removes trailing slashes,
241
+ removes "http://" or "https://",
242
+ removes "www." prefix,
243
+ removes "@" prefix,
244
+ and decodes URL-encoded characters.
245
+ """
246
+ if EuroPythonSpeaker ._is_blank_or_na (text ):
247
+ print (f"Blank or N/A input: { text } " )
248
+ return None
249
+
250
+ text = text .strip ()
251
+
252
+ # Handle inputs like "LinkedIn: https://linkedin.com/in/username"
253
+ # or "GH: https://github.com/username"
254
+ text = text .split (" " , 1 )[1 ] if ": " in text else text
255
+
256
+ text = text .split ("?" , 1 )[0 ]
257
+ text = text .split ("," , 1 )[0 ]
258
+ text = text .rstrip ("/" )
140
259
141
260
if text .startswith ("https://" ):
142
261
text = text [8 :]
@@ -150,19 +269,11 @@ def extract_bluesky_url(text: str) -> str:
150
269
if text .startswith ("@" ):
151
270
text = text [1 :]
152
271
153
- for marker in ("bsky.app/profile/" , "bsky/" ):
154
- if marker in text :
155
- text = text .split (marker , 1 )[1 ]
156
- break
157
- # case custom domain
158
- else :
159
- text = text .rsplit ("/" , 1 )[- 1 ]
160
-
161
- # if there's no dot, assume it's a non-custom handle and append '.bsky.social'
162
- if "." not in text :
163
- text += ".bsky.social"
272
+ # Percent-encode non-ASCII characters
273
+ if not text .isascii ():
274
+ text = quote (text , safe = "@/-_.+~#=:" )
164
275
165
- return f"https://bsky.app/profile/ { text } "
276
+ return text . lower ()
166
277
167
278
168
279
class EuroPythonSession (BaseModel ):
0 commit comments