Skip to content

Commit 9e6620b

Browse files
committed
basic scraper up
1 parent 6ee4243 commit 9e6620b

File tree

3 files changed

+509
-30
lines changed

3 files changed

+509
-30
lines changed

apps/scraper/src/index.ts

Lines changed: 141 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,146 @@
1-
/** biome-ignore-all lint/correctness/noUnusedFunctionParameters: disable for now as they haven't been implemented yet */
21
import { Hono } from "hono";
32

4-
const app = new Hono<{ Bindings: CloudflareBindings }>();
3+
/** Environment bindings available on `c.env` for the routes in this file. */
type Bindings = {
4+
// Optional fallback URL; GET /scrape uses it when no `url` query parameter is supplied.
SCRAPING_BASE_URL?: string;
5+
};
6+
7+
/**
 * Normalizes text scraped from HTML into a single-line string.
 *
 * Decodes the non-breaking-space and ampersand entities that survive in raw
 * markup (`&nbsp;`, `&#160;`, `&#xA0;`, `&amp;`, `&#38;`, `&#x26;`), collapses
 * every whitespace run to one space (the `\s` class already covers U+00A0),
 * and trims both ends.
 *
 * @param s Raw text, possibly containing undecoded HTML entities.
 * @returns The cleaned text; an empty string when only whitespace remains.
 */
const clean = (s: string): string =>
  s
    // Decode nbsp before `&amp;` so that "&amp;nbsp;" yields the literal "&nbsp;" (single-pass decoding).
    .replace(/&(?:nbsp|#0*160|#x0*a0);/gi, " ")
    .replace(/&(?:amp|#0*38|#x0*26);/gi, "&")
    .replace(/\s+/g, " ")
    .trim();
8+
/**
 * Extracts the first signed integer or decimal number found in `s`.
 *
 * @param s Text to search; may be null.
 * @returns The parsed number, or null when `s` is null/empty or contains no digits.
 */
const toNumber = (s: string | null) => {
9+
// Both null and the empty string count as "no value".
if (!s) return null;
10+
// Only the first match is used, so a range such as "3-4" yields 3.
const m = s.match(/-?\d+(\.\d+)?/);
11+
return m ? Number(m[0]) : null;
12+
};
13+
14+
// titles to ignore (descriptive/non-course lines)
15+
// Case-insensitive, whole-word match for phrases that mark a line as instructions or a heading rather than a course.
const DESCRIPTIVE_RE = /\b(select (one|two)|choose|introductory courses?|electives?|course list)\b/i;
16+
17+
/**
 * Extracts course rows from the curriculum course-list table of a page.
 *
 * Runs `html` through an HTMLRewriter and, for every
 * `#curriculumtextcontainer table.sc_courselist tbody tr`, accumulates the
 * text of its cells into code / title / credits buffers. A row is kept when
 * at least two of the three fields are present and it is neither a
 * header/summary row (by class) nor a code-less descriptive line.
 *
 * @param html Full HTML document text.
 * @returns The kept rows in document order; each field may be null.
 */
async function parseCurriculumTable(html: string) {
18+
type Row = { code: string | null; title: string | null; credits: number | null };
19+
20+
// Flags toggled by the element handlers below; they gate the row and cell handlers.
let inCurriculum = false;
21+
let inTable = false;
22+
23+
// Per-row buffers
24+
let trClass = "";
25+
let codeBuf = "";
26+
let titleBuf = "";
27+
let creditsBuf = "";
28+
29+
const rows: Row[] = [];
30+
31+
// Clears the per-row buffers; called when a new <tr> starts.
const resetRow = () => {
32+
trClass = "";
33+
codeBuf = "";
34+
titleBuf = "";
35+
creditsBuf = "";
36+
};
37+
38+
// Turns the current buffers into a Row and appends it to `rows` if it passes the filters.
const commitRow = () => {
39+
// Skip headers/area headers/subheaders quickly
40+
if (/\b(areaheader|areasubheader|listsum|plangridsum|plangridtotal)\b/i.test(trClass)) return;
41+
42+
// An empty string after cleaning is normalized to null.
const code = clean(codeBuf) || null;
43+
const title = clean(titleBuf) || null;
44+
const credits = toNumber(clean(creditsBuf));
545

6-
app.get("/", async (c) => {
7-
// const db = await createDB(c.env);
8-
// TODO: use hono to render a dashboard to monitor the scraping status
46+
// Ignore clearly descriptive lines (“Select one of the following”, “Introductory Courses”, etc.)
47+
if (!code && title && DESCRIPTIVE_RE.test(title)) return;
48+
49+
// Keep if at least two of the three fields are present
50+
const have = [!!code, !!title, credits !== null].filter(Boolean).length;
51+
if (have >= 2) {
52+
rows.push({ code, title, credits });
53+
}
54+
};
55+
56+
const rewriter = new HTMLRewriter()
57+
// Scope to the Curriculum tab content
58+
.on("#curriculumtextcontainer", {
59+
element(e) {
60+
inCurriculum = true;
61+
e.onEndTag(() => {
62+
inCurriculum = false;
63+
inTable = false;
64+
});
65+
},
66+
})
67+
// Only the course list table inside Curriculum
68+
.on("#curriculumtextcontainer table.sc_courselist", {
69+
element(e) {
70+
if (!inCurriculum) return;
71+
inTable = true;
72+
e.onEndTag(() => {
73+
inTable = false;
74+
});
75+
},
76+
})
77+
// Track each row
78+
.on("#curriculumtextcontainer table.sc_courselist tbody tr", {
79+
element(e) {
80+
if (!inTable) return;
81+
resetRow();
82+
// The class attribute is what commitRow uses to recognize header/summary rows.
trClass = e.getAttribute("class") || "";
83+
e.onEndTag(() => {
84+
if (!inTable) return;
85+
commitRow();
86+
});
87+
},
88+
})
89+
// codecol -> course number
90+
.on("#curriculumtextcontainer table.sc_courselist tbody tr td.codecol", {
91+
text(t) {
92+
if (!inTable) return;
93+
// Text may arrive in several chunks, so append rather than assign.
codeBuf += t.text;
94+
},
95+
})
96+
// hourscol -> credits
97+
.on("#curriculumtextcontainer table.sc_courselist tbody tr td.hourscol", {
98+
text(t) {
99+
if (!inTable) return;
100+
creditsBuf += t.text;
101+
},
102+
})
103+
// any other cell (including span.courselistcomment) -> title/description
104+
// any other cell (not codecol or hourscol) -> title/description
105+
.on("#curriculumtextcontainer table.sc_courselist tbody tr td:not(.codecol):not(.hourscol)", {
106+
text(t) {
107+
if (!inTable) return;
108+
// Text from every matching cell in the row is concatenated with no separator.
titleBuf += t.text;
109+
},
110+
})
111+
// The transformed output is discarded; presumably reading the body to completion is what drives the handlers — confirm against the HTMLRewriter docs.
await rewriter.transform(new Response(html)).text();
112+
return rows;
113+
}
114+
115+
// Hono application typed with the Bindings declared in this file.
const app = new Hono<{ Bindings: Bindings }>();
116+
117+
// Root route: responds with the plain text "ok".
app.get("/", c => c.text("ok"));
118+
119+
// GET /scrape?url=...
120+
// Fetches the page named by the `url` query parameter (falling back to the
// SCRAPING_BASE_URL binding), extracts the program title and the curriculum
// rows, and returns them as JSON. Responds with 400 when no URL is available.
//
// NOTE(review): `url` is caller-supplied and fetched as-is — there is no scheme
// or host validation here; consider an allow-list before exposing this route.
// NOTE(review): a rejected fetch is not caught, and a non-2xx response body is
// still parsed; the upstream status is only reported in the JSON payload.
app.get("/scrape", async c => {
121+
// `||` means an empty `url` query value also falls back to the binding.
const url = c.req.query("url") || c.env.SCRAPING_BASE_URL;
122+
if (!url) return c.json({ error: "No URL provided" }, 400);
123+
124+
const res = await fetch(url, {
125+
headers: {
126+
"user-agent": "CF-Worker Scraper/1.0 (+https://developers.cloudflare.com/workers/)",
127+
},
128+
cf: { cacheEverything: false },
129+
});
130+
const html = await res.text();
131+
132+
// Program title = <h1 class="page-title">...</h1>
133+
const h1 = html.match(/<h1[^>]*class=["'][^"']*page-title[^"']*["'][^>]*>([\s\S]*?)<\/h1>/i);
134+
// Strip any nested tags from the captured heading before normalizing whitespace.
const programTitle = h1 ? clean(h1[1].replace(/<[^>]+>/g, "")) : null;
135+
136+
const items = await parseCurriculumTable(html);
137+
138+
return c.json({
139+
url,
140+
status: res.status,
141+
programTitle,
142+
items, // flattened list of kept rows: { code|null, title|null, credits|null }
143+
});
9144
});
10145

11-
// Worker entry points: HTTP fetch plus placeholder scheduled and queue handlers.
export default {
12-
fetch: app.fetch,
13-
14-
// Scheduled handler; the body is a placeholder (only commented-out code and a TODO).
async scheduled(event: ScheduledEvent, env: CloudflareBindings) {
15-
// const db = await createDB(env);
16-
// const api = new ConvexApi({
17-
// baseUrl: env.CONVEX_SITE_URL,
18-
// apiKey: env.CONVEX_API_KEY,
19-
// });
20-
// TODO: set up jobs that collect the list of URLs to be scraped and add them to the queue as "discovery"
21-
},
22-
23-
// Queue handler; the body is a placeholder (only commented-out code and a TODO).
async queue(batch: MessageBatch<Error>, env: CloudflareBindings) {
24-
// const db = await createDB(env);
25-
// const api = new ConvexApi({
26-
// baseUrl: env.CONVEX_SITE_URL,
27-
// apiKey: env.CONVEX_API_KEY,
28-
// });
29-
// TODO: set up jobs that scrape the given URL and save the structured data to the Convex database
30-
},
31-
};
146+
// Worker entry point: only the HTTP fetch handler (the Hono app) is exported.
export default { fetch: app.fetch };

bun.lock

Lines changed: 180 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)