Skip to content

Commit aaeea9f

Browse files
authored
feat(scraper): including programName in courses (#99)
1 parent 3d6f063 commit aaeea9f

File tree

13 files changed

+283
-8
lines changed

13 files changed

+283
-8
lines changed

apps/scraper/biome.json

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@
33
"$schema": "https://biomejs.dev/schemas/2.3.5/schema.json",
44
"extends": "//",
55
"files": {
6-
"includes": ["**", "!worker-configuration.d.ts", "!**/__mocks__"]
6+
"includes": [
7+
"**",
8+
"!worker-configuration.d.ts",
9+
"!**/__mocks__",
10+
"!!**/migrations"
11+
]
712
}
813
}

apps/scraper/package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
"db:studio:local": "drizzle-kit studio --config=drizzle-dev.config.ts",
1010
"db:studio:remote": "drizzle-kit studio --config=drizzle-prod.config.ts",
1111
"db:generate": "drizzle-kit generate --config=drizzle-dev.config.ts",
12-
"db:migrate:local": "wrangler d1 migrations apply scraper-ops --local",
13-
"db:migrate:remote": "wrangler d1 migrations apply scraper-ops --remote",
12+
"db:migrate:local": "wrangler d1 migrations apply albert-plus-scraper-ops --local",
13+
"db:migrate:remote": "wrangler d1 migrations apply albert-plus-scraper-ops --remote",
1414
"db:push:local": "drizzle-kit push --config=drizzle-dev.config.ts",
1515
"db:push:remote": "drizzle-kit push --config=drizzle-prod.config.ts",
1616
"cf-typegen": "wrangler types --env-interface CloudflareBindings",
apps/scraper/src/drizzle/migrations/0000_easy_fat_cobra.sql

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
CREATE TABLE `error_logs` (
2+
`id` text PRIMARY KEY NOT NULL,
3+
`job_id` text,
4+
`error_type` text NOT NULL,
5+
`error_message` text NOT NULL,
6+
`stack_trace` text,
7+
`timestamp` integer NOT NULL,
8+
FOREIGN KEY (`job_id`) REFERENCES `jobs`(`id`) ON UPDATE no action ON DELETE no action
9+
);
10+
--> statement-breakpoint
11+
CREATE TABLE `jobs` (
12+
`id` text PRIMARY KEY NOT NULL,
13+
`url` text NOT NULL,
14+
`status` text DEFAULT 'pending' NOT NULL,
15+
`job_type` text NOT NULL,
16+
`metadata` text,
17+
`created_at` integer NOT NULL,
18+
`started_at` integer,
19+
`completed_at` integer
20+
);
apps/scraper/src/drizzle/migrations/meta/0000_snapshot.json

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
{
2+
"version": "6",
3+
"dialect": "sqlite",
4+
"id": "997f5332-898a-4afa-8bb0-2fd3db6abff3",
5+
"prevId": "00000000-0000-0000-0000-000000000000",
6+
"tables": {
7+
"error_logs": {
8+
"name": "error_logs",
9+
"columns": {
10+
"id": {
11+
"name": "id",
12+
"type": "text",
13+
"primaryKey": true,
14+
"notNull": true,
15+
"autoincrement": false
16+
},
17+
"job_id": {
18+
"name": "job_id",
19+
"type": "text",
20+
"primaryKey": false,
21+
"notNull": false,
22+
"autoincrement": false
23+
},
24+
"error_type": {
25+
"name": "error_type",
26+
"type": "text",
27+
"primaryKey": false,
28+
"notNull": true,
29+
"autoincrement": false
30+
},
31+
"error_message": {
32+
"name": "error_message",
33+
"type": "text",
34+
"primaryKey": false,
35+
"notNull": true,
36+
"autoincrement": false
37+
},
38+
"stack_trace": {
39+
"name": "stack_trace",
40+
"type": "text",
41+
"primaryKey": false,
42+
"notNull": false,
43+
"autoincrement": false
44+
},
45+
"timestamp": {
46+
"name": "timestamp",
47+
"type": "integer",
48+
"primaryKey": false,
49+
"notNull": true,
50+
"autoincrement": false
51+
}
52+
},
53+
"indexes": {},
54+
"foreignKeys": {
55+
"error_logs_job_id_jobs_id_fk": {
56+
"name": "error_logs_job_id_jobs_id_fk",
57+
"tableFrom": "error_logs",
58+
"tableTo": "jobs",
59+
"columnsFrom": [
60+
"job_id"
61+
],
62+
"columnsTo": [
63+
"id"
64+
],
65+
"onDelete": "no action",
66+
"onUpdate": "no action"
67+
}
68+
},
69+
"compositePrimaryKeys": {},
70+
"uniqueConstraints": {},
71+
"checkConstraints": {}
72+
},
73+
"jobs": {
74+
"name": "jobs",
75+
"columns": {
76+
"id": {
77+
"name": "id",
78+
"type": "text",
79+
"primaryKey": true,
80+
"notNull": true,
81+
"autoincrement": false
82+
},
83+
"url": {
84+
"name": "url",
85+
"type": "text",
86+
"primaryKey": false,
87+
"notNull": true,
88+
"autoincrement": false
89+
},
90+
"status": {
91+
"name": "status",
92+
"type": "text",
93+
"primaryKey": false,
94+
"notNull": true,
95+
"autoincrement": false,
96+
"default": "'pending'"
97+
},
98+
"job_type": {
99+
"name": "job_type",
100+
"type": "text",
101+
"primaryKey": false,
102+
"notNull": true,
103+
"autoincrement": false
104+
},
105+
"metadata": {
106+
"name": "metadata",
107+
"type": "text",
108+
"primaryKey": false,
109+
"notNull": false,
110+
"autoincrement": false
111+
},
112+
"created_at": {
113+
"name": "created_at",
114+
"type": "integer",
115+
"primaryKey": false,
116+
"notNull": true,
117+
"autoincrement": false
118+
},
119+
"started_at": {
120+
"name": "started_at",
121+
"type": "integer",
122+
"primaryKey": false,
123+
"notNull": false,
124+
"autoincrement": false
125+
},
126+
"completed_at": {
127+
"name": "completed_at",
128+
"type": "integer",
129+
"primaryKey": false,
130+
"notNull": false,
131+
"autoincrement": false
132+
}
133+
},
134+
"indexes": {},
135+
"foreignKeys": {},
136+
"compositePrimaryKeys": {},
137+
"uniqueConstraints": {},
138+
"checkConstraints": {}
139+
}
140+
},
141+
"views": {},
142+
"enums": {},
143+
"_meta": {
144+
"schemas": {},
145+
"tables": {},
146+
"columns": {}
147+
},
148+
"internal": {
149+
"indexes": {}
150+
}
151+
}
apps/scraper/src/drizzle/migrations/meta/_journal.json

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{
2+
"version": "7",
3+
"dialect": "sqlite",
4+
"entries": [
5+
{
6+
"idx": 0,
7+
"version": "6",
8+
"when": 1763337192931,
9+
"tag": "0000_easy_fat_cobra",
10+
"breakpoints": true
11+
}
12+
]
13+
}

apps/scraper/src/modules/courses/index.test.ts

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,48 @@ describe("Courses Scraper", () => {
5050
}
5151
});
5252

53+
test("should extract program name from page title", async () => {
54+
const mockDb = createMockDb();
55+
const mockEnv = createMockEnv();
56+
57+
const courses = await scrapeCourse(
58+
"https://bulletins.nyu.edu/courses/csci_ua/",
59+
mockDb,
60+
mockEnv,
61+
);
62+
63+
expect(courses.length).toBeGreaterThan(0);
64+
65+
// Verify that programName is extracted from the title
66+
const firstCourse = courses[0];
67+
expect(firstCourse.course).toHaveProperty("programName");
68+
expect(firstCourse.course.programName).toBe("Computer Science");
69+
70+
// Verify that program code is still correct
71+
expect(firstCourse.course.program).toMatch(/CSCI-UA/);
72+
});
73+
74+
test("should extract program name from pages with numeric course codes", async () => {
75+
const mockDb = createMockDb();
76+
const mockEnv = createMockEnv();
77+
78+
const courses = await scrapeCourse(
79+
"https://bulletins.nyu.edu/courses/hrcm1_gc/",
80+
mockDb,
81+
mockEnv,
82+
);
83+
84+
expect(courses.length).toBeGreaterThan(0);
85+
86+
// Verify that programName is extracted correctly for codes with numbers like HRCM1-GC
87+
const firstCourse = courses[0];
88+
expect(firstCourse.course).toHaveProperty("programName");
89+
expect(firstCourse.course.programName).toBe("Human Resources");
90+
91+
// Verify that program code is still correct
92+
expect(firstCourse.course.program).toMatch(/HRCM1-GC/);
93+
});
94+
5395
test("should handle invalid course URLs", async () => {
5496
const mockDb = createMockDb();
5597
const mockEnv = createMockEnv();

apps/scraper/src/modules/courses/index.ts

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ export async function scrapeCourse(
183183
let currentCredits = 0;
184184
let currentDescription = "";
185185
let currentPrereqs = "";
186+
let programName = "";
186187

187188
class CourseBlockHandler {
188189
element(element: Element) {
@@ -194,14 +195,17 @@ export async function scrapeCourse(
194195

195196
element.onEndTag(() => {
196197
if (currentCode && currentTitle) {
197-
const codeMatch = currentCode.match(/([A-Z]+(?:-[A-Z]+)?)\s+(\d+)/);
198+
const codeMatch = currentCode.match(
199+
/([A-Z0-9]+(?:-[A-Z0-9]+)?)\s+(\d+)/,
200+
);
198201
if (codeMatch) {
199202
const [, program, courseNumber] = codeMatch;
200203
const level = getCourseLevel(program, courseNumber);
201204

202205
courses.push({
203206
course: {
204207
program,
208+
programName: programName || "Unknown Program",
205209
code: currentCode,
206210
level,
207211
title: currentTitle,
@@ -259,7 +263,18 @@ export async function scrapeCourse(
259263
}
260264
}
261265

266+
class PageTitleHandler {
267+
text(text: { text: string }) {
268+
const titleText = text.text.trim();
269+
const match = titleText.match(/^([^(]+)\s+\([A-Z0-9-]+\)$/);
270+
if (match && !programName) {
271+
programName = match[1].trim();
272+
}
273+
}
274+
}
275+
262276
const rewriter = new HTMLRewriter()
277+
.on("h1.page-title", new PageTitleHandler())
263278
.on(".courseblock", new CourseBlockHandler())
264279
.on(".detail-code strong", new CodeHandler())
265280
.on(".detail-title strong", new TitleHandler())
@@ -277,8 +292,8 @@ function parsePrerequisites(text: string): CoursePrerequisite[] {
277292

278293
const cleanText = text.replace(/^Prerequisites?:\s*/i, "").trim();
279294

280-
// Match course codes
281-
const coursePattern = /([A-Z]+(?:-[A-Z]+)?)\s+(\d+)/g;
295+
// Match course codes (including codes with numbers like HRCM1-GC)
296+
const coursePattern = /([A-Z0-9]+(?:-[A-Z0-9]+)?)\s+(\d+)/g;
282297
const matches = [...cleanText.matchAll(coursePattern)];
283298

284299
if (matches.length > 0) {

apps/scraper/wrangler.jsonc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@
3131
{
3232
"binding": "DB",
3333
"database_name": "albert-plus-scraper-ops",
34-
"database_id": "b96165c7-c7ae-488e-b171-a9ceeef8937c"
34+
"database_id": "b96165c7-c7ae-488e-b171-a9ceeef8937c",
35+
"migrations_dir": "src/drizzle/migrations"
3536
}
3637
],
3738
"vars": {

packages/server/convex/http.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ const normalizeCourseLevel = (
7676

7777
export const ZUpsertCourseWithPrerequisites = z.object({
7878
program: z.string(),
79+
programName: z.string(),
7980
code: z.string(),
8081
level: ZCourseLevel,
8182
title: z.string(),

packages/server/convex/schemas/courses.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import { schoolName } from "./schools";
44
const courses = {
55
code: v.string(), // CSCI-UA 101
66
program: v.string(), // CSCI-UA
7+
programName: v.string(), // Computer Science
78
level: v.union(v.literal("undergraduate"), v.literal("graduate")),
89
title: v.string(), // Intro to Computer Science
910
credits: v.number(), // 4

0 commit comments

Comments
 (0)