Skip to content

Commit def1a6b

Browse files
committed
Improve socials extraction and testing
1 parent ef24b8d commit def1a6b

File tree

8 files changed

+562
-141
lines changed

8 files changed

+562
-141
lines changed

data/examples/europython/speakers.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
"submissions": ["A8CD3F"],
99
"affiliation": "A Company",
1010
"homepage": null,
11-
"gitx": "https://github.com/F3DC8A",
12-
"linkedin_url": "https://www.linkedin.com/in/F3DC8A",
11+
"gitx_url": "https://github.com/f3dc8a",
12+
"linkedin_url": "https://linkedin.com/in/f3dc8a",
1313
"bluesky_url": "https://bsky.app/profile/username.bsky.social",
1414
"mastodon_url": null,
1515
"twitter_url": null,

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ dependencies = [
1919

2020
[dependency-groups]
2121
dev = [
22+
"hypothesis>=6.131",
2223
"pre-commit>=4.2",
2324
"pytest>=8.3.5",
2425
"ruff>=0.11.4",

src/models/europython.py

Lines changed: 179 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from __future__ import annotations
22

3+
import re
34
from datetime import date, datetime
5+
from urllib.parse import quote
46

57
from pydantic import BaseModel, Field, computed_field, field_validator, model_validator
68

@@ -29,7 +31,7 @@ class EuroPythonSpeaker(BaseModel):
2931
mastodon_url: str | None = None
3032
linkedin_url: str | None = None
3133
bluesky_url: str | None = None
32-
gitx: str | None = None
34+
gitx_url: str | None = None
3335

3436
@computed_field
3537
def website_url(self) -> str:
@@ -50,93 +52,210 @@ def extract_answers(cls, values) -> dict:
5052
values["homepage"] = answer.answer_text
5153

5254
if answer.question_text == SpeakerQuestion.twitter:
53-
values["twitter_url"] = cls.extract_twitter_url(
54-
answer.answer_text.strip().split()[0]
55-
)
55+
values["twitter_url"] = cls.extract_twitter_url(answer.answer_text)
5656

5757
if answer.question_text == SpeakerQuestion.mastodon:
58-
values["mastodon_url"] = cls.extract_mastodon_url(
59-
answer.answer_text.strip().split()[0]
60-
)
58+
values["mastodon_url"] = cls.extract_mastodon_url(answer.answer_text)
6159

6260
if answer.question_text == SpeakerQuestion.bluesky:
63-
values["bluesky_url"] = cls.extract_bluesky_url(
64-
answer.answer_text.strip().split()[0]
65-
)
61+
values["bluesky_url"] = cls.extract_bluesky_url(answer.answer_text)
6662

6763
if answer.question_text == SpeakerQuestion.linkedin:
68-
values["linkedin_url"] = cls.extract_linkedin_url(
69-
answer.answer_text.strip().split()[0]
70-
)
64+
values["linkedin_url"] = cls.extract_linkedin_url(answer.answer_text)
7165

7266
if answer.question_text == SpeakerQuestion.gitx:
73-
values["gitx"] = answer.answer_text.strip().split()[0]
67+
values["gitx_url"] = cls.extract_gitx_url(answer.answer_text)
7468

7569
return values
7670

7771
@staticmethod
78-
def extract_twitter_url(text: str) -> str:
72+
def extract_twitter_url(text: str) -> str | None:
7973
"""
80-
Extract the Twitter URL from the answer
74+
Extracts a Twitter profile URL from the given text.
75+
Cleans the input and handles the following formats:
76+
- @username
77+
- username
78+
- twitter.com/username
79+
- x.com/username
8180
"""
82-
if text.startswith("@"):
83-
twitter_url = f"https://x.com/{text[1:]}"
84-
elif not text.startswith(("https://", "http://", "www.")):
85-
twitter_url = f"https://x.com/{text}"
86-
else:
87-
twitter_url = (
88-
f"https://{text.removeprefix('https://').removeprefix('http://')}"
89-
)
81+
cleaned = EuroPythonSpeaker._clean_social_input(text)
82+
if cleaned is None:
83+
print(f"Invalid Twitter URL: {text}")
84+
return None
9085

91-
return twitter_url.split("?")[0]
86+
# https://twitter.com/username (username max 15 chars)
87+
match = re.match(r"^(twitter\.com|x\.com)/([\w]{1,15})$", cleaned)
88+
if match:
89+
_, username = match.groups()
90+
return f"https://x.com/{username}"
91+
92+
# only username
93+
if re.match(r"^[\w]{1,15}$", cleaned):
94+
return f"https://x.com/{cleaned}"
95+
96+
print(f"Invalid Twitter URL: {cleaned}")
97+
return None
9298

9399
@staticmethod
94-
def extract_mastodon_url(text: str) -> None | str:
100+
def extract_mastodon_url(text: str) -> str | None:
95101
"""
96-
Normalize Mastodon handle or URL to the format: https://<instance>/@<username>
102+
Extracts a Mastodon profile URL from the given text.
103+
Supports formats like:
104+
- @username@instance
105+
- username@instance
106+
- instance/@username
107+
- instance/@username@instance (with redirect)
108+
Returns: https://<instance>/@<username>
97109
"""
98-
text = text.strip().split("?", 1)[0]
99-
100-
# Handle @username@instance or username@instance formats
101-
if "@" in text and not text.startswith("http"):
102-
parts = text.split("@")
103-
if len(parts) == 3: # @username@instance
104-
_, username, instance = parts
105-
elif len(parts) == 2: # username@instance
106-
username, instance = parts
107-
else:
108-
return None
110+
cleaned = EuroPythonSpeaker._clean_social_input(text)
111+
if not cleaned:
112+
print(f"Invalid Mastodon URL: {text}")
113+
return None
114+
115+
# instance/@username
116+
match = re.match(r"^([\w\.-]+)/@([\w\.-]+)$", cleaned)
117+
if match:
118+
instance, username = match.groups()
109119
return f"https://{instance}/@{username}"
110120

111-
# Handle full URLs
112-
if text.startswith("http://"):
113-
text = "https://" + text[len("http://") :]
121+
parts = cleaned.split("@")
122+
if len(parts) == 3: # instance/@username@instance
123+
_, username, instance = parts
124+
elif len(parts) == 2: # username@instance
125+
username, instance = parts
126+
else:
127+
print(f"Invalid Mastodon URL: {cleaned}")
128+
return None
129+
130+
if username and instance:
131+
return f"https://{instance}/@{username}"
114132

115-
return text
133+
print(f"Invalid Mastodon URL: {cleaned}")
134+
return None
116135

117136
@staticmethod
118-
def extract_linkedin_url(text: str) -> str:
137+
def extract_linkedin_url(text: str) -> str | None:
119138
"""
120-
Extract the LinkedIn URL from the answer
139+
Extracts a LinkedIn personal profile URL from the given text.
140+
Cleans the input and handles formats like:
141+
- username
142+
- linkedin.com/in/username
143+
- @username
144+
- tr.linkedin.com/in/username (country subdomains)
121145
"""
122-
if text.startswith("in/"):
123-
linkedin_url = f"https://linkedin.com/{text}"
124-
elif not text.startswith(("https://", "http://", "www.", "linkedin.")):
125-
linkedin_url = f"https://linkedin.com/in/{text}"
146+
cleaned = EuroPythonSpeaker._clean_social_input(text)
147+
if cleaned is None:
148+
print(f"Invalid LinkedIn URL: {text}")
149+
return None
150+
151+
if cleaned.startswith("in/"):
152+
linkedin_url = f"https://linkedin.com/{cleaned}"
153+
elif not cleaned.startswith(("linkedin.", "in/")) and "." not in cleaned:
154+
linkedin_url = f"https://linkedin.com/in/{cleaned}"
126155
else:
127-
linkedin_url = (
128-
f"https://{text.removeprefix('https://').removeprefix('http://')}"
129-
)
156+
linkedin_url = f"https://{cleaned}"
157+
158+
if not re.match(
159+
r"^https://([\w-]+\.)?linkedin\.com/in/(?:[\w\-]|%[0-9A-Fa-f]{2})+$",
160+
linkedin_url,
161+
):
162+
print(f"Invalid LinkedIn URL: {linkedin_url}")
163+
return None
130164

131-
return linkedin_url.split("?")[0]
165+
return linkedin_url
132166

133167
@staticmethod
134-
def extract_bluesky_url(text: str) -> str:
168+
def extract_bluesky_url(text: str) -> str | None:
135169
"""
136-
Returns a normalized BlueSky URL in the form https://bsky.app/profile/<USERNAME>.bsky.social,
137-
or uses the entire domain if it's custom (e.g., .dev).
170+
Extracts a Bluesky profile URL from the given text.
171+
Cleans the input and handles formats like:
172+
- username
173+
- bsky.app/profile/username
174+
- bsky/username
175+
- username.dev
176+
- @username
177+
- username.bsky.social
138178
"""
139-
text = text.strip().split("?", 1)[0]
179+
cleaned = EuroPythonSpeaker._clean_social_input(text)
180+
if cleaned is None:
181+
print(f"Invalid Bluesky URL: {text}")
182+
return None
183+
184+
for marker in ("bsky.app/profile/", "bsky/"):
185+
if marker in cleaned:
186+
cleaned = cleaned.split(marker, 1)[1]
187+
break
188+
else:
189+
cleaned = cleaned.rsplit("/", 1)[-1]
190+
191+
if "." not in cleaned:
192+
cleaned += ".bsky.social"
193+
194+
bluesky_url = f"https://bsky.app/profile/{cleaned}"
195+
196+
if not re.match(r"^https://bsky\.app/profile/[\w\.-]+\.[\w\.-]+$", bluesky_url):
197+
print(f"Invalid Bluesky URL: {bluesky_url}")
198+
return None
199+
200+
return bluesky_url
201+
202+
@staticmethod
203+
def extract_gitx_url(text: str) -> str | None:
204+
"""
205+
Extracts a GitHub/GitLab URL from the given text.
206+
Cleans the input and handles formats like:
207+
- username
208+
- github.com/username
209+
- gitlab.com/username
210+
- @username
211+
"""
212+
cleaned = EuroPythonSpeaker._clean_social_input(text)
213+
if cleaned is None:
214+
print(f"Invalid GitHub/GitLab URL: {text}")
215+
return None
216+
217+
if cleaned.startswith(("github.com/", "gitlab.com/")):
218+
return f"https://{cleaned}"
219+
220+
if re.match(r"^[\w-]+$", cleaned): # assume github.com
221+
return f"https://github.com/{cleaned}"
222+
223+
print(f"Invalid GitHub/GitLab URL: {cleaned}")
224+
return None
225+
226+
@staticmethod
227+
def _is_blank_or_na(text: str) -> bool:
228+
"""
229+
Check whether the text is empty or equals "N/A" or "-" (ignoring case and surrounding whitespace)
230+
"""
231+
return not text or text.strip().lower() in {"n/a", "-"}
232+
233+
@staticmethod
234+
def _clean_social_input(text: str) -> str | None:
235+
"""
236+
Cleans the input string for social media URLs.
237+
Returns None if the input is blank or "N/A",
238+
removes prefixes like "LinkedIn: " or "GH: ",
239+
removes parameters like "?something=true",
240+
removes trailing slashes,
241+
removes "http://" or "https://",
242+
removes "www." prefix,
243+
removes "@" prefix,
244+
percent-encodes non-ASCII characters, and lowercases the result.
245+
"""
246+
if EuroPythonSpeaker._is_blank_or_na(text):
247+
print(f"Blank or N/A input: {text}")
248+
return None
249+
250+
text = text.strip()
251+
252+
# Handle inputs like "LinkedIn: https://linkedin.com/in/username"
253+
# or "GH: https://github.com/username"
254+
text = text.split(" ", 1)[1] if ": " in text else text
255+
256+
text = text.split("?", 1)[0]
257+
text = text.split(",", 1)[0]
258+
text = text.rstrip("/")
140259

141260
if text.startswith("https://"):
142261
text = text[8:]
@@ -150,19 +269,11 @@ def extract_bluesky_url(text: str) -> str:
150269
if text.startswith("@"):
151270
text = text[1:]
152271

153-
for marker in ("bsky.app/profile/", "bsky/"):
154-
if marker in text:
155-
text = text.split(marker, 1)[1]
156-
break
157-
# case custom domain
158-
else:
159-
text = text.rsplit("/", 1)[-1]
160-
161-
# if there's no dot, assume it's a non-custom handle and append '.bsky.social'
162-
if "." not in text:
163-
text += ".bsky.social"
272+
# Percent-encode non-ASCII characters
273+
if not text.isascii():
274+
text = quote(text, safe="@/-_.+~#=:")
164275

165-
return f"https://bsky.app/profile/{text}"
276+
return text.lower()
166277

167278

168279
class EuroPythonSession(BaseModel):

0 commit comments

Comments
 (0)