Skip to content

Commit 05a85cf

Browse files
authored
Create scrape_workshops.py
1 parent 30016d7 commit 05a85cf

File tree

1 file changed

+200
-0
lines changed

1 file changed

+200
-0
lines changed
Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Scrape upcoming workshops from D-Lab EventSpark page
4+
"""
5+
6+
import json
7+
import requests
8+
from bs4 import BeautifulSoup
9+
from datetime import datetime, timezone
10+
import pytz
11+
import re
12+
import os
13+
14+
# URL to scrape: D-Lab's public EventSpark events listing page.
EVENTS_URL = "https://dlab.my.salesforce-sites.com/events"

# Output file for the scraped JSON (a `_data/` path — presumably consumed
# by a static-site build; confirm against the site generator config).
OUTPUT_FILE = "_data/upcoming_workshops.json"
19+
20+
def parse_datetime_string(date_str, time_str):
    """Parse a date string and a time-range string into an ISO 8601 timestamp.

    Args:
        date_str: Date like ``"Aug 19, 2025"``.
        time_str: Time range like ``"10:00 AM - 12:00 PM (GMT-7:00)"``; only
            the start time is used.

    Returns:
        ISO 8601 string for the start time localized to America/Los_Angeles
        (the page publishes Pacific times), or ``None`` when either the time
        pattern is absent or the combined string fails to parse.
    """
    # stdlib zoneinfo (3.9+) replaces the third-party pytz dependency;
    # imported locally so the rest of the module is unaffected.
    from zoneinfo import ZoneInfo

    # Extract only the start time from the "start - end (tz)" range.
    time_match = re.match(r'(\d{1,2}:\d{2}\s*[AP]M)', time_str)
    if not time_match:
        return None

    try:
        dt = datetime.strptime(f"{date_str} {time_match.group(1)}",
                               "%b %d, %Y %I:%M %p")
    except ValueError as e:
        # Narrow catch: only a parse failure is expected here.
        print(f"Error parsing datetime: {e}")
        return None

    # Attach the Pacific zone explicitly so the ISO string carries an offset.
    return dt.replace(tzinfo=ZoneInfo("America/Los_Angeles")).isoformat()
39+
40+
41+
def extract_workshop_info(workshop_div):
    """Pull title, date/time, and registration URL out of one workshop element.

    Args:
        workshop_div: BeautifulSoup element wrapping a single workshop listing.

    Returns:
        dict with ``title``, ``date``, ``time``, ``datetime_iso``,
        ``registration_url`` and ``scraped_at`` keys, or ``None`` when
        extraction raises.
    """
    try:
        # Title: prefer a heading/strong tag, then fall back to anything
        # whose class name mentions title/heading.
        heading = workshop_div.find(['h2', 'h3', 'h4', 'strong'])
        if heading is None:
            heading = workshop_div.find(class_=re.compile('title|heading', re.I))
        title = heading.text.strip() if heading else "Unknown Workshop"

        full_text = workshop_div.get_text()

        # Date like "Aug 19, 2025" anywhere in the element's text.
        found_date = re.search(r'([A-Za-z]{3}\s+\d{1,2},\s+\d{4})', full_text)
        date_text = found_date.group(1) if found_date else ""

        # Time range like "10:00 AM - 12:00 PM (GMT-7:00)".
        found_time = re.search(
            r'(\d{1,2}:\d{2}\s*[AP]M\s*-\s*\d{1,2}:\d{2}\s*[AP]M\s*\([^)]+\))',
            full_text)
        time_text = found_time.group(1) if found_time else ""

        # Registration URL: an EventSpark event link wins immediately;
        # otherwise the last generic register/signup/book link seen is kept.
        register_link = None
        for anchor in workshop_div.find_all('a', href=True):
            href = anchor['href']
            if '/events/event/home/' in href or 'event/home/' in href:
                if not href.startswith('http'):
                    href = f"https://dlab.my.salesforce-sites.com{href}"
                register_link = href
                break
            if any(term in href.lower() for term in ['register', 'signup', 'book']):
                if not href.startswith('http'):
                    href = f"https://dlab.my.salesforce-sites.com{href}"
                register_link = href

        # Only attempt datetime parsing when both pieces were found.
        datetime_iso = (parse_datetime_string(date_text, time_text)
                        if date_text and time_text else None)

        return {
            "title": title,
            "date": date_text,
            "time": time_text,
            "datetime_iso": datetime_iso,
            "registration_url": register_link,
            "scraped_at": datetime.now(timezone.utc).isoformat()
        }
    except Exception as e:
        print(f"Error extracting workshop info: {e}")
        return None
101+
102+
103+
def _find_workshop_containers(soup):
    """Locate candidate div elements that each wrap one workshop listing."""
    containers = []

    # Strategy 1: walk upward from each EventSpark event link until we reach
    # an ancestor that contains both a date and a workshop-like keyword.
    for link in soup.find_all('a', href=re.compile(r'/events/event/home/')):
        container = link.find_parent('div')
        while container and container.parent:
            text = container.get_text()
            if (re.search(r'[A-Za-z]{3}\s+\d{1,2},\s+\d{4}', text) and
                    any(keyword in text.upper() for keyword in
                        ['FUNDAMENTALS', 'WORKSHOP', 'PYTHON', 'R ', 'DATA',
                         'MACHINE', 'API'])):
                # Bug fix: the original checked membership of the link's
                # immediate parent but appended the (possibly different)
                # ancestor container, so duplicates could slip through.
                # Dedupe on the element actually appended.
                if container not in containers:
                    containers.append(container)
                break
            container = container.parent

    # Strategy 2: class-name based search.
    if not containers:
        containers = soup.find_all(
            'div', class_=re.compile('workshop|event|session', re.I))

    # Strategy 3: any div with a keyword, a date, and at least one link.
    if not containers:
        for div in soup.find_all('div'):
            text = div.get_text()
            if (any(keyword in text.upper() for keyword in
                    ['FUNDAMENTALS', 'WORKSHOP', 'PYTHON', 'R ', 'DATA']) and
                    re.search(r'[A-Za-z]{3}\s+\d{1,2},\s+\d{4}', text) and
                    div.find('a', href=True)):  # Has a link
                containers.append(div)

    return containers


def _write_output(output):
    """Serialize *output* to OUTPUT_FILE, creating the parent dir if needed."""
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, 'w') as f:
        json.dump(output, f, indent=2)


def scrape_workshops():
    """Fetch the events page, extract workshops, and write them as JSON.

    On any failure a valid JSON file with zero workshops and an ``error``
    key is still written so downstream builds do not break.
    """
    print(f"Fetching workshops from {EVENTS_URL}")

    try:
        response = requests.get(EVENTS_URL, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        workshop_containers = _find_workshop_containers(soup)
        print(f"Found {len(workshop_containers)} potential workshop containers")

        workshops = []
        for container in workshop_containers:
            workshop_info = extract_workshop_info(container)
            if workshop_info and workshop_info['title'] != "Unknown Workshop":
                workshops.append(workshop_info)
                print(f"Extracted: {workshop_info['title']}")

        # Sort chronologically; entries with no parsed datetime ('' key)
        # sort first.
        workshops.sort(key=lambda x: x['datetime_iso'] or '')

        # Keep only workshops whose start time is still in the future.
        current_time = datetime.now(timezone.utc)
        upcoming_workshops = [
            w for w in workshops
            if w['datetime_iso']
            and datetime.fromisoformat(w['datetime_iso']) > current_time
        ]

        output = {
            "last_updated": datetime.now(timezone.utc).isoformat(),
            "total_workshops": len(upcoming_workshops),
            "workshops": upcoming_workshops
        }
        _write_output(output)

        print(f"Successfully saved {len(upcoming_workshops)} upcoming workshops to {OUTPUT_FILE}")

    except Exception as e:
        # Best-effort fallback: record the error but still emit valid JSON
        # to prevent build errors downstream.
        print(f"Error scraping workshops: {e}")
        _write_output({
            "last_updated": datetime.now(timezone.utc).isoformat(),
            "total_workshops": 0,
            "workshops": [],
            "error": str(e)
        })
198+
199+
# Script entry point: run the scraper when executed directly.
if __name__ == "__main__":
    scrape_workshops()

0 commit comments

Comments
 (0)