import { Hono } from "hono";
4 | | -const app = new Hono<{ Bindings: CloudflareBindings }>(); |
| 3 | +type Bindings = { |
| 4 | + SCRAPING_BASE_URL?: string; |
| 5 | +}; |
| 6 | + |
| 7 | +const clean = (s: string) => s.replace(/\s+/g, " ").replace(/\u00A0/g, " ").trim(); |
| 8 | +const toNumber = (s: string | null) => { |
| 9 | + if (!s) return null; |
| 10 | + const m = s.match(/-?\d+(\.\d+)?/); |
| 11 | + return m ? Number(m[0]) : null; |
| 12 | +}; |
| 13 | + |
| 14 | +// titles to ignore (descriptive/non-course lines) |
| 15 | +const DESCRIPTIVE_RE = /\b(select (one|two)|choose|introductory courses?|electives?|course list)\b/i; |
| 16 | + |
| 17 | +async function parseCurriculumTable(html: string) { |
| 18 | + type Row = { code: string | null; title: string | null; credits: number | null }; |
| 19 | + |
| 20 | + let inCurriculum = false; |
| 21 | + let inTable = false; |
| 22 | + |
| 23 | + // Per-row buffers |
| 24 | + let trClass = ""; |
| 25 | + let codeBuf = ""; |
| 26 | + let titleBuf = ""; |
| 27 | + let creditsBuf = ""; |
| 28 | + |
| 29 | + const rows: Row[] = []; |
| 30 | + |
| 31 | + const resetRow = () => { |
| 32 | + trClass = ""; |
| 33 | + codeBuf = ""; |
| 34 | + titleBuf = ""; |
| 35 | + creditsBuf = ""; |
| 36 | + }; |
| 37 | + |
| 38 | + const commitRow = () => { |
| 39 | + // Skip headers/area headers/subheaders quickly |
| 40 | + if (/\b(areaheader|areasubheader|listsum|plangridsum|plangridtotal)\b/i.test(trClass)) return; |
| 41 | + |
| 42 | + const code = clean(codeBuf) || null; |
| 43 | + const title = clean(titleBuf) || null; |
| 44 | + const credits = toNumber(clean(creditsBuf)); |
5 | 45 |
|
6 | | -app.get("/", async (c) => { |
7 | | - // const db = await createDB(c.env); |
8 | | - // TODO: use hono to render a dashboard to monitor the scraping status |
| 46 | + // Ignore clearly descriptive lines (“Select one of the following”, “Introductory Courses”, etc.) |
| 47 | + if (!code && title && DESCRIPTIVE_RE.test(title)) return; |
| 48 | + |
| 49 | + // Keep if at least two of the three fields are present |
| 50 | + const have = [!!code, !!title, credits !== null].filter(Boolean).length; |
| 51 | + if (have >= 2) { |
| 52 | + rows.push({ code, title, credits }); |
| 53 | + } |
| 54 | + }; |
| 55 | + |
| 56 | + const rewriter = new HTMLRewriter() |
| 57 | + // Scope to the Curriculum tab content |
| 58 | + .on("#curriculumtextcontainer", { |
| 59 | + element(e) { |
| 60 | + inCurriculum = true; |
| 61 | + e.onEndTag(() => { |
| 62 | + inCurriculum = false; |
| 63 | + inTable = false; |
| 64 | + }); |
| 65 | + }, |
| 66 | + }) |
| 67 | + // Only the course list table inside Curriculum |
| 68 | + .on("#curriculumtextcontainer table.sc_courselist", { |
| 69 | + element(e) { |
| 70 | + if (!inCurriculum) return; |
| 71 | + inTable = true; |
| 72 | + e.onEndTag(() => { |
| 73 | + inTable = false; |
| 74 | + }); |
| 75 | + }, |
| 76 | + }) |
| 77 | + // Track each row |
| 78 | + .on("#curriculumtextcontainer table.sc_courselist tbody tr", { |
| 79 | + element(e) { |
| 80 | + if (!inTable) return; |
| 81 | + resetRow(); |
| 82 | + trClass = e.getAttribute("class") || ""; |
| 83 | + e.onEndTag(() => { |
| 84 | + if (!inTable) return; |
| 85 | + commitRow(); |
| 86 | + }); |
| 87 | + }, |
| 88 | + }) |
| 89 | + // codecol -> course number |
| 90 | + .on("#curriculumtextcontainer table.sc_courselist tbody tr td.codecol", { |
| 91 | + text(t) { |
| 92 | + if (!inTable) return; |
| 93 | + codeBuf += t.text; |
| 94 | + }, |
| 95 | + }) |
| 96 | + // hourscol -> credits |
| 97 | + .on("#curriculumtextcontainer table.sc_courselist tbody tr td.hourscol", { |
| 98 | + text(t) { |
| 99 | + if (!inTable) return; |
| 100 | + creditsBuf += t.text; |
| 101 | + }, |
| 102 | + }) |
| 103 | + // any other cell (including span.courselistcomment) -> title/description |
| 104 | + // any other cell (not codecol or hourscol) -> title/description |
| 105 | + .on("#curriculumtextcontainer table.sc_courselist tbody tr td:not(.codecol):not(.hourscol)", { |
| 106 | + text(t) { |
| 107 | + if (!inTable) return; |
| 108 | + titleBuf += t.text; |
| 109 | + }, |
| 110 | + }) |
| 111 | + await rewriter.transform(new Response(html)).text(); |
| 112 | + return rows; |
| 113 | +} |
| 114 | + |
| 115 | +const app = new Hono<{ Bindings: Bindings }>(); |
| 116 | + |
| 117 | +app.get("/", c => c.text("ok")); |
| 118 | + |
| 119 | +// GET /scrape?url=... |
| 120 | +app.get("/scrape", async c => { |
| 121 | + const url = c.req.query("url") || c.env.SCRAPING_BASE_URL; |
| 122 | + if (!url) return c.json({ error: "No URL provided" }, 400); |
| 123 | + |
| 124 | + const res = await fetch(url, { |
| 125 | + headers: { |
| 126 | + "user-agent": "CF-Worker Scraper/1.0 (+https://developers.cloudflare.com/workers/)", |
| 127 | + }, |
| 128 | + cf: { cacheEverything: false }, |
| 129 | + }); |
| 130 | + const html = await res.text(); |
| 131 | + |
| 132 | + // Program title = <h1 class="page-title">...</h1> |
| 133 | + const h1 = html.match(/<h1[^>]*class=["'][^"']*page-title[^"']*["'][^>]*>([\s\S]*?)<\/h1>/i); |
| 134 | + const programTitle = h1 ? clean(h1[1].replace(/<[^>]+>/g, "")) : null; |
| 135 | + |
| 136 | + const items = await parseCurriculumTable(html); |
| 137 | + |
| 138 | + return c.json({ |
| 139 | + url, |
| 140 | + status: res.status, |
| 141 | + programTitle, |
| 142 | + items, // flattened list of kept rows: { code|null, title|null, credits|null } |
| 143 | + }); |
9 | 144 | }); |
10 | 145 |
|
11 | | -export default { |
12 | | - fetch: app.fetch, |
13 | | - |
14 | | - async scheduled(event: ScheduledEvent, env: CloudflareBindings) { |
15 | | - // const db = await createDB(env); |
16 | | - // const api = new ConvexApi({ |
17 | | - // baseUrl: env.CONVEX_SITE_URL, |
18 | | - // apiKey: env.CONVEX_API_KEY, |
19 | | - // }); |
20 | | - // TODO: set up jobs for scraping a list of urls need to be scraped and add them to queue as "discovery" |
21 | | - }, |
22 | | - |
23 | | - async queue(batch: MessageBatch<Error>, env: CloudflareBindings) { |
24 | | - // const db = await createDB(env); |
25 | | - // const api = new ConvexApi({ |
26 | | - // baseUrl: env.CONVEX_SITE_URL, |
27 | | - // apiKey: env.CONVEX_API_KEY, |
28 | | - // }); |
29 | | - // TODO: set up jobs for scrping given url and save structured data to convex database |
30 | | - }, |
31 | | -}; |
| 146 | +export default { fetch: app.fetch }; |