Skip to content

Commit 5e48838

Browse files
jophy-ye and dtemkin1 authored
Parse Course 6 Special Subjects (#292)
Closes #288. ## Features * Description, units, instructors in charge, etc. * Time slots of all lectures and/or recitations with location * URL to the correct section of the [EECS special subjects page](https://www.eecs.mit.edu/academics/subject-updates/subject-updates-spring-2026/) * "Same as" ## Issues Remaining - [x] 6.S976 has a schedule but it cannot be correctly shown on Hydrant somehow. 6.S898 has the same class time but it is shown. I actually have no idea why. Help is much appreciated. - [x] Linting ig --------- Co-authored-by: Diego Temkin <65834932+dtemkin1@users.noreply.github.com>
1 parent ce9c011 commit 5e48838

File tree

5 files changed

+614
-2
lines changed

5 files changed

+614
-2
lines changed

scrapers/departments/__init__.py

Whitespace-only changes.
Lines changed: 366 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,366 @@
1+
"""
2+
Temporary helper to parse EECS Subject Updates (Spring 2026).
3+
Intended to help generate override data for Course 6 special subjects.
4+
5+
Imitates the structure of math_dept.py: scrape a departmental page, parse rows,
6+
and return a dict of overrides.
7+
8+
Functions:
9+
* get_rows()
10+
* parse_schedule(schedule_line)
11+
* parse_header(text)
12+
* parse_many_timeslots(days, slot, is_pm_int)
13+
* make_raw_sections(days, slot, room, is_pm_int)
14+
* parse_row(row)
15+
* run()
16+
"""
17+
18+
from __future__ import annotations
19+
20+
import re
21+
from pprint import pprint
22+
from typing import Any, Dict, List, Literal, Optional, Tuple
23+
from urllib.request import Request, urlopen
24+
25+
from bs4 import BeautifulSoup, Tag
26+
27+
from scrapers.fireroad import parse_section, parse_timeslot
28+
from scrapers.utils import EVE_TIMES, TIMES
29+
30+
# The EECS WordPress page renders its subject list by dynamically loading this HTML
31+
# fragment (see network request `.../plugins/subj_2026SP.html` in a browser).
32+
# `requests.get()` of the WordPress page often returns only navigation chrome, so
33+
# this script scrapes the source-of-truth fragment directly.
34+
URL = "https://eecsis.mit.edu/plugins/subj_2026SP.html"
35+
FRONTEND_URL = (
36+
"https://www.eecs.mit.edu/academics/subject-updates/subject-updates-spring-2026/"
37+
)
38+
# Match a 6.S### subject header, optionally with an "(also ...)" clause.
39+
# Group 1: the 6.S### number
40+
# Group 2 (optional): comma-separated cross-list numbers (for the "same" field)
41+
# Group 3: the title text (excluding the "(also ...)" clause when present)
42+
COURSE_RE = re.compile(
43+
r"\b(6\.S\d{3})\b"
44+
r"(?:\s*\(\s*also(?: under)?\s+"
45+
r"([A-Za-z]{0,5}\d{0,3}[A-Za-z]{0,3}\.[A-Za-z]{0,3}\d{1,4}[A-Za-z]?"
46+
r"(?:\s*,\s*[A-Za-z]{0,5}\d{0,3}[A-Za-z]{0,3}\.[A-Za-z]{0,3}\d{1,4}[A-Za-z]?)*"
47+
r")\s*\)\s*)?"
48+
r"(.*)$",
49+
re.IGNORECASE,
50+
)
51+
DAY_WORD = {
52+
"monday": "M",
53+
"tuesday": "T",
54+
"wednesday": "W",
55+
"thursday": "R",
56+
"friday": "F",
57+
}
58+
59+
60+
Timeslot = Tuple[int, int]
61+
Section = Tuple[List[Timeslot], str]
62+
Units = Dict[
63+
Literal["lectureUnits", "labUnits", "preparationUnits", "isVariableUnits"], Any
64+
]
65+
RawSectionFields = Dict[str, List[str]]
66+
SectionFields = Dict[str, List[Section]]
67+
68+
69+
def normalize_days(days_raw: str) -> str:
    """
    Normalize a day string into Fireroad-compatible day letters (MTWRF).

    Examples:
    - "TR" -> "TR"
    - "Thursdays" -> "R"
    """
    assert days_raw, "empty day string"

    # Already-abbreviated form: validate the letters and pass it through.
    if days_raw.isupper():
        unknown = set(days_raw) - set("MTWRF")
        assert not unknown, days_raw
        return days_raw

    # Full weekday name (possibly plural): map via the lookup table.
    singular = days_raw.lower().rstrip("s")
    assert singular in DAY_WORD, days_raw
    return DAY_WORD[singular]
86+
87+
88+
def parse_many_timeslots(days: str, slot: str, is_pm_int: int) -> list[Timeslot]:
    """
    Parse one slot string into a Timeslot for each listed day.

    Args:
    * days (str): A list of days (e.g. "TR")
    * slot (str): The timeslot (e.g. "1-2.30" or "7-10 PM")
    * is_pm_int (int): 0 for AM-ish slots, 1 for PM-ish slots

    Returns:
    * list[Timeslot]: All parsed timeslots
    """
    assert is_pm_int in (0, 1), is_pm_int
    pm = bool(is_pm_int)
    slots: list[Timeslot] = []
    for day in days:
        slots.append(parse_timeslot(day, slot, pm))
    return slots
102+
103+
104+
def make_raw_sections(days: str, slot: str, room: str, is_pm_int: int) -> str:
    """
    Format a raw section string (same shape as math_dept.py).

    Args:
    * days (str): Fireroad day letters, e.g. "TR"
    * slot (str): time range, e.g. "2.30-4"
    * room (str): room identifier, e.g. "34-101"
    * is_pm_int (int): 0 for daytime slots, 1 for evening slots

    Returns:
    * str: "room/days/is_pm_int/slot"
    """
    for required in (days, slot, room):
        assert required
    assert is_pm_int in (0, 1), is_pm_int

    return "/".join((room, days, str(is_pm_int), slot))
114+
115+
116+
def parse_schedule(
    schedule_line: str,
) -> tuple[RawSectionFields, SectionFields, list[str]]:
    """
    Parse a schedule value like:
    "Lectures: TR2:30-4, room 34-101"
    "Lecture: MW1-2:30, room 32-155; Recitations: Tuesdays 2-3p, room 36-112"
    "Lectures: Thursdays 7-10pm, room 2-131"

    Args:
    * schedule_line (str): The raw schedule line

    Returns:
    * (RawSectionFields, SectionFields, list[str]):
        - RawSectionFields: mapping like "lectureRawSections" -> list[str]
        - SectionFields: mapping like "lectureSections" -> list[Section]
        - list[str]: the kind ("lecture", "recitation", ...) of each chunk,
          in order of appearance

    Both dicts are intended to be merged into the per-course `data` dict in
    `parse_row()` via `data.update(...)`.
    """
    assert schedule_line and schedule_line != "TBD", schedule_line

    # Semicolons separate the sections (lecture vs. recitation vs. lab, etc.).
    chunks = list(filter(None, schedule_line.split(";")))
    raw_fields: RawSectionFields = {}
    section_fields: SectionFields = {}
    kinds: list[str] = []

    for idx, chunk in enumerate(chunks):
        # One chunk looks like "[Kind:] <days><start>[am/pm]-<end>[am/pm], room <room>".
        # The kind prefix is optional; day abbreviations and full weekday names
        # are both accepted; times may use "." or ":" as the minutes separator.
        m = re.match(
            r"^(?:(?P<kind>Lectures?|Lecture|Recitations?|Recitation|Labs?|Lab|"
            r"Designs?|Design):\s*)?"
            r"(?P<days>(?:[MTWRF]+)|(?:Monday|Tuesday|Wednesday|Thursday|Friday|"
            r"Mondays|Tuesdays|Wednesdays|Thursdays|Fridays))\s*"
            r"(?P<start>[0-9]+(?:[.:][0-9]{2})?)"
            r"(?:\s*(?P<start_ampm>am|pm|a|p))?\s*-\s*"
            r"(?P<end>[0-9]+(?:[.:][0-9]{2})?)"
            r"(?:\s*(?P<end_ampm>am|pm|a|p))?\s*,\s*room\s+"
            r"(?P<room>[A-Za-z0-9-]+)(?:\s+.*)?$",
            chunk.strip(),
            re.IGNORECASE,
        )
        assert m is not None, chunk

        kind = "lecture"
        if m.group("kind") is not None:
            kind = m.group("kind").lower().rstrip("s")  # drop 's' in e.g. Lectures
        else:
            assert idx == 0, "Only the first chunk may omit its kind (assumed lecture)"

        # Normalize "2:30" -> "2.30" to match the keys used by TIMES/EVE_TIMES.
        start = m.group("start").replace(":", ".")
        end = m.group("end").replace(":", ".")

        # Classify the slot as daytime vs. evening by membership in the
        # time tables from scrapers.utils (presumably TIMES holds daytime
        # slot names and EVE_TIMES evening ones — confirm against that module).
        is_day = start in TIMES and end in TIMES
        is_eve = start in EVE_TIMES and end in EVE_TIMES
        assert is_day or is_eve, (start, end)
        is_pm_int = 0 if is_day else 1

        raw = make_raw_sections(
            normalize_days(m.group("days")),
            f"{start}-{end}" + (" PM" if is_pm_int == 1 else ""),
            m.group("room"),
            is_pm_int,
        )
        # Accumulate per-kind lists; a kind may appear in several chunks.
        raw_fields.setdefault(f"{kind}RawSections", []).append(raw)
        section_fields.setdefault(f"{kind}Sections", []).append(parse_section(raw))
        kinds.append(kind)

    return raw_fields, section_fields, kinds
184+
185+
186+
def get_rows() -> list[Tag]:
    """
    Scrape the EECS subject updates fragment and return one heading tag per
    detected 6.S### subject entry.

    Args: none

    Returns:
    * list[Tag]: BeautifulSoup tags for each detected 6.S### subject
    """
    req = Request(URL)
    req.add_unredirected_header(
        "User-Agent", "hydrant-scrapers (https://github.com/sipb/hydrant)"
    )

    with urlopen(req, timeout=15) as resp:
        fragment = resp.read().decode("utf-8")

    soup = BeautifulSoup(fragment, features="lxml")
    flattened = soup.get_text(" ", strip=True)
    assert COURSE_RE.search(flattened) is not None, f"No 6.S### entries found on {URL}"

    # Each subject block begins with an h6 heading containing the subject number.
    headings = soup.find_all("h6")
    assert headings, "No <h6> course headings found"
    return headings
212+
213+
214+
def parse_header(text: str) -> tuple[str, str, Optional[str]]:
    """
    Parse a header block containing a course number.

    Returns:
    * tuple[str, str, str | None]: (course_number, title_fragment, same_csv)
    """
    match = COURSE_RE.search(text)
    assert match
    course, same_as, raw_title = match.group(1, 2, 3)
    # Trim the separator punctuation before the title and the trailing
    # section mark left over from the heading.
    title = raw_title.lstrip(" :-–—\t").rstrip("§ ")
    return course, title, same_as
227+
228+
229+
def parse_units(units_str: str) -> Units:
    """
    Parse units string like "3-0-9" or "12" into a dict suitable for `data.update(...)`.

    Args:
        units_str (str): Units string from the webpage

    Returns:
        Units:
            Dict with keys: lectureUnits, labUnits, preparationUnits, isVariableUnits.

    Raises:
        ValueError: If the string matches neither the "L-B-P" form nor the
            plain-number form. Always raised with the uniform message
            "Invalid units string: ..." (including when a dash-separated part
            is not an integer).
    """
    # Fixed-unit form like "3-0-9" (lecture-lab-preparation).
    if "-" in units_str:
        parts = units_str.split("-")
        if len(parts) == 3:
            try:
                lecture, lab, prep = (int(part) for part in parts)
            except ValueError:
                # Re-raise with the uniform, searchable message instead of
                # leaking int()'s "invalid literal for int()" error.
                raise ValueError(f"Invalid units string: {units_str}") from None
            return {
                "lectureUnits": lecture,
                "labUnits": lab,
                "preparationUnits": prep,
                "isVariableUnits": False,
            }
        raise ValueError(f"Invalid units string: {units_str}")

    # Single number like "12" - variable units
    if units_str.isdigit():
        return {
            "lectureUnits": 0,
            "labUnits": 0,
            "preparationUnits": 0,
            "isVariableUnits": True,
        }

    # Can't parse
    raise ValueError(f"Invalid units string: {units_str}")
264+
265+
266+
def parse_level(level_str: str) -> str:
    """
    Parse level string to "U" or "G".

    Args:
        level_str (str): Level string from the webpage

    Returns:
        str: "U" for undergraduate, "G" for graduate
    """
    # A listing may mention both "undergraduate" and "graduate"; any mention
    # of "undergrad" (or no mention of "graduate" at all) defaults to "U".
    lowered = level_str.lower()
    if "undergrad" in lowered or "graduate" not in lowered:
        return "U"
    return "G"
281+
282+
283+
def parse_row(row: Tag) -> dict[str, dict[str, Any]]:
    """
    Parses a single row (one subject entry).

    Args:
    * row (Tag): header + body blocks

    Returns:
    * dict[str, dict[str, Any]]: A single-entry overrides dict
    """
    course, title, same_as = parse_header(row.get_text(" ", strip=True))
    data: dict[str, Any] = {"url": f'{FRONTEND_URL}#{course.replace(".", "_", 1)}'}

    if title:
        data["name"] = title
    if same_as:
        data["same"] = same_as

    # The fragment lays out each subject as:
    #   <h6> ... </h6>
    #   <hr/>
    #   <table> key/value metadata </table>
    #   <hr/>
    #   <div> description ... </div>
    table = row.find_next_sibling("table")
    assert table is not None, f"Missing metadata table for {course}"

    # Flatten the two-column metadata rows into a key -> value mapping,
    # skipping malformed rows and empty cells.
    meta: dict[str, str] = {}
    for tr in table.find_all("tr"):
        cells = tr.find_all("td")
        if len(cells) != 2:
            continue
        key = cells[0].get_text(" ", strip=True).rstrip(":")
        value = cells[1].get_text(" ", strip=True)
        if key and value:
            meta[key] = value

    # Parse Level (if present)
    if "Level" in meta:
        data["level"] = parse_level(meta["Level"])

    # Parse Units (if present and parseable)
    if "Units" in meta:
        data.update(parse_units(meta["Units"]))

    if "Instructors" in meta:
        data["inCharge"] = meta["Instructors"].replace("\n", ", ")

    if "Prereqs" in meta:
        data["prereqs"] = meta["Prereqs"]

    if "Schedule" in meta and meta["Schedule"] != "TBD":
        raw_fields, section_fields, kinds = parse_schedule(meta["Schedule"])
        data.update(raw_fields)
        data.update(section_fields)
        if kinds:
            data["sectionKinds"] = kinds

    desc_div = table.find_next_sibling("div")
    assert desc_div is not None, f"Missing description block for {course}"
    data["description"] = desc_div.get_text(" ", strip=True)

    return {course: data}
347+
348+
349+
def run() -> dict[str, dict[str, Any]]:
    """
    The main entry point.

    Args: none

    Returns:
    * dict[str, dict[str, Any]]: Overrides keyed by subject number.
    """
    overrides: dict[str, dict[str, Any]] = {}
    for heading in get_rows():
        overrides.update(parse_row(heading))
    return overrides
363+
364+
365+
# Allow running this scraper standalone for debugging: pretty-print the
# overrides dict instead of writing it anywhere.
if __name__ == "__main__":
    pprint(run())
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222
from bs4 import BeautifulSoup, Tag
2323

24-
from .fireroad import parse_section, parse_timeslot
24+
from scrapers.fireroad import parse_section, parse_timeslot
2525

2626

2727
def parse_when(when: str) -> tuple[str, str]:

0 commit comments

Comments
 (0)