
Commit a79e3a9

feat(scraper): add scraper job logic for course offerings (#82)
1 parent 77b6597 commit a79e3a9

14 files changed, +293 -60 lines changed

apps/docs/src/content/docs/architecture/data-flow.md

Lines changed: 19 additions & 3 deletions
@@ -8,6 +8,8 @@ Understanding the flow of data is crucial to comprehending how AlbertPlus works.
 
 The primary data pipeline is responsible for collecting, storing, and serving course and program information.
 
+### Static Course & Program Data (Manual Trigger)
+
 1. **Scraping (Cloudflare Worker)**
    - **Admin Trigger**: Admin users initiate scraping by calling Convex actions (`api.scraper.triggerMajorsScraping` or `api.scraper.triggerCoursesScraping`).
    - **Authenticated Request**: The Convex action makes a POST request to the scraper's HTTP endpoints (`/api/trigger-majors` or `/api/trigger-courses`) with the `CONVEX_API_KEY` in the `X-API-KEY` header.
@@ -17,14 +19,28 @@ The primary data pipeline is responsible for collecting, storing, and serving co
    - **Data Extraction**: Each job in the queue is processed by the worker, which scrapes the detailed information for a specific course or program.
    - **Upsert to Backend**: The scraped data is sent back to the Convex backend via authenticated HTTP endpoints.
 
+### Dynamic Course Offerings Data (Scheduled)
+
+1. **Automated Scraping (Cloudflare Worker Cronjob)**
+   - **Scheduled Trigger**: A cronjob runs at regular intervals (configured in `wrangler.jsonc`).
+   - **Config Check**: The worker reads app configuration from Convex to determine which terms to scrape (`is_scrape_current`, `is_scrape_next`, along with term/year information).
+   - **Albert Public Search**: For each enabled term, the worker scrapes Albert's public class search to discover all course offering URLs.
+   - **Job Queuing**: Each course offering URL is added to the queue as a `course-offering` job with metadata about the term and year.
+   - **Section Details**: Each job scrapes detailed information including:
+     - Class number, section, and status (open/closed/waitlist)
+     - Instructor names and location
+     - Meeting days, start time, and end time
+     - Corequisite relationships
+   - **Batch Upsert**: Scraped course offerings are sent to Convex in batches via the `/api/courseOfferings/upsert` endpoint.
+
 2. **Backend Processing (Convex)**
    - **Data Reception**: The Convex backend receives the scraped data from the Cloudflare Worker.
-   - **Database Storage**: The data is upserted into the Convex database, ensuring that existing records are updated and new ones are created. This includes courses, programs, requirements, and prerequisites.
+   - **Database Storage**: The data is upserted into the Convex database, ensuring that existing records are updated and new ones are created. This includes courses, programs, requirements, prerequisites, and course offerings.
    - **Real-time Updates**: Any clients connected to the Convex backend (such as the web app) will receive real-time updates as the new data is written to the database.
 
 3. **Client-side Consumption (Web App & Browser Extension)**
-   - **Data Fetching**: The Next.js web app and the browser extension query the Convex backend to fetch course and program data.
-   - **User Interface**: The data is then rendered in the user interface, allowing students to browse the course catalog, view program requirements, and build their schedules.
+   - **Data Fetching**: The Next.js web app and the browser extension query the Convex backend to fetch course, program, and course offering data.
+   - **User Interface**: The data is then rendered in the user interface, allowing students to browse the course catalog, view program requirements, check real-time class availability, and build their schedules.
 
 ## Degree Progress Report Parsing
 
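The admin-trigger flow described in the diff above reduces to an authenticated POST from a Convex action to the worker. A minimal sketch of that request, with a hypothetical helper name (`triggerScrape`); the endpoint paths and the `X-API-KEY` header are documented above, everything else is illustrative:

```ts
// Hedged sketch of the admin-triggered scrape request. The endpoint paths and
// X-API-KEY header come from the docs above; the helper name and error
// handling are illustrative assumptions, not code from this commit.
async function triggerScrape(
  scraperUrl: string,
  apiKey: string,
  kind: "majors" | "courses",
): Promise<void> {
  const endpoint =
    kind === "majors" ? "/api/trigger-majors" : "/api/trigger-courses";
  const res = await fetch(new URL(endpoint, scraperUrl), {
    method: "POST",
    headers: { "X-API-KEY": apiKey },
  });
  if (!res.ok) {
    throw new Error(`Scraper trigger failed with status ${res.status}`);
  }
}
```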

apps/docs/src/content/docs/architecture/overview.md

Lines changed: 3 additions & 1 deletion
@@ -10,7 +10,9 @@ The AlbertPlus ecosystem is composed of several distinct yet interconnected appl
 
 - **Web Application**: A feature-rich Next.js application that serves as the primary user interface for course planning and schedule building.
 - **Browser Extension**: A Chrome extension built with Plasmo that integrates with the native Albert website, providing a seamless user experience.
-- **Web Scraper**: A Cloudflare Worker that periodically scrapes course data from NYU's public-facing systems to ensure the information in AlbertPlus is always up-to-date.
+- **Web Scraper**: A Cloudflare Worker that scrapes course data from NYU's public-facing systems, including:
+  - **Manual Scraping**: Admin-triggered scraping of static course catalog and program data from NYU bulletins
+  - **Automated Scraping**: Scheduled cronjob that scrapes real-time course offerings (sections, availability, schedules) from Albert public search
 - **Serverless Backend**: A Convex-powered backend that provides a real-time database, serverless functions, and authentication services.
 - **Documentation Site**: An Astro and Starlight-based website that you are currently viewing, which serves as the central hub for all project documentation.
 

apps/docs/src/content/docs/getting-started/environment-variables.md

Lines changed: 20 additions & 12 deletions
@@ -27,29 +27,37 @@ These variables are needed for the Chrome browser extension.
 
 ## Scraper (`apps/scraper`)
 
-These variables are required for the Cloudflare Worker scraper.
+These environment variables are required for the Cloudflare Worker scraper.
 
-| Variable            | Description                                                                     |
-| ------------------- | ------------------------------------------------------------------------------- |
-| `CONVEX_SITE_URL`   | The HTTP API URL for your Convex backend.                                       |
-| `CONVEX_API_KEY`    | An API key for authenticating with the Convex backend.                          |
-| `SCRAPING_BASE_URL` | The base URL for the NYU course bulletins (e.g., `https://bulletins.nyu.edu/`). |
+| Variable          | Description                                             |
+| ----------------- | ------------------------------------------------------- |
+| `CONVEX_SITE_URL` | The HTTP API URL for your Convex backend.               |
+| `CONVEX_API_KEY`  | An API key for authenticating with the Convex backend.  |
 
 ## Convex Backend (`packages/server`)
 
 These variables are configured in your Convex deployment environment.
 
-| Variable                  | Description                                                                          |
-| ------------------------- | ------------------------------------------------------------------------------------ |
-| `CLERK_JWT_ISSUER_DOMAIN` | The JWT issuer domain from your Clerk account for token validation.                  |
+| Variable                  | Description                                                                           |
+| ------------------------- | ------------------------------------------------------------------------------------- |
+| `CLERK_JWT_ISSUER_DOMAIN` | The JWT issuer domain from your Clerk account for token validation.                   |
 | `CONVEX_API_KEY`          | A shared API key for authenticating requests between Convex and the scraper worker.  |
-| `SCRAPER_URL`             | The URL of the deployed scraper worker (e.g., `https://scraper.albertplus.com`).     |
+| `SCRAPER_URL`             | The URL of the deployed scraper worker (e.g., `https://scraper.albertplus.com`).      |
 
-## Cloudflare Worker Bindings
+## Cloudflare Worker Configuration
 
-These are not environment variables in the traditional sense, but rather bindings configured in the `wrangler.jsonc` file.
+These are configured in `wrangler.jsonc`.
+
+### Bindings
 
 | Binding          | Type        | Description                              |
 | ---------------- | ----------- | ---------------------------------------- |
 | `SCRAPING_QUEUE` | Queue       | Binding for the Cloudflare Worker queue. |
 | `DB`             | D1 Database | Binding for the Cloudflare D1 database.  |
+
+### Variables
+
+| Variable                   | Description                                                                                     |
+| -------------------------- | ----------------------------------------------------------------------------------------------- |
+| `SCRAPING_BASE_URL`        | The base URL for NYU course bulletins (e.g., `https://bulletins.nyu.edu/`).                     |
+| `ALBERT_SCRAPING_BASE_URL` | The base URL for Albert public class search (e.g., `https://bulletins.nyu.edu/class-search/`).  |
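For orientation, a minimal `wrangler.jsonc` sketch wiring up the bindings and variables from the tables above. The cron expression, queue name, and database ID are assumed placeholders, not values from this commit:

```jsonc
// Hypothetical wrangler.jsonc sketch; the cron schedule, queue name, and
// database_id are illustrative placeholders, not values from this commit.
{
  "name": "scraper",
  "main": "src/index.ts",
  "compatibility_date": "2025-01-01", // placeholder
  "triggers": {
    "crons": ["0 */6 * * *"] // assumed interval; the real schedule is project-defined
  },
  "queues": {
    "producers": [{ "binding": "SCRAPING_QUEUE", "queue": "scraping-queue" }],
    "consumers": [{ "queue": "scraping-queue" }]
  },
  "d1_databases": [
    {
      "binding": "DB",
      "database_name": "scraper-db",
      "database_id": "<your-d1-database-id>"
    }
  ],
  "vars": {
    "SCRAPING_BASE_URL": "https://bulletins.nyu.edu/",
    "ALBERT_SCRAPING_BASE_URL": "https://bulletins.nyu.edu/class-search/"
  }
}
```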

apps/docs/src/content/docs/modules/convex.md

Lines changed: 13 additions & 0 deletions
@@ -35,3 +35,16 @@ bun run dashboard
 | `userCourseOfferings` | Links users to the specific course offerings they have added to their schedule. |
 | `students`            | Stores student-specific information, linked to a Clerk user ID.                 |
 | `schools`             | A list of the different schools within NYU.                                     |
+
+## App Configuration Keys
+
+The `appConfigs` table stores various configuration settings that control scraper behavior and term information:
+
+| Key                 | Type                                         | Description                                                       |
+| ------------------- | -------------------------------------------- | ----------------------------------------------------------------- |
+| `current_term`      | `"spring" \| "summer" \| "fall" \| "j-term"` | The current academic term                                         |
+| `current_year`      | `string`                                     | The current academic year (e.g., `"2025"`)                        |
+| `next_term`         | `"spring" \| "summer" \| "fall" \| "j-term"` | The next academic term                                            |
+| `next_year`         | `string`                                     | The next academic year                                            |
+| `is_scrape_current` | `"true" \| "false"`                          | Flag to enable/disable scraping of current term course offerings  |
+| `is_scrape_next`    | `"true" \| "false"`                          | Flag to enable/disable scraping of next term course offerings     |
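Every `appConfigs` value is stored as a string, including the `"true"`/`"false"` flags, so consumers coerce types themselves. A sketch of that coercion, mirroring what the scheduled handler in this commit does inline; the helper shape and the injected `getAppConfig` signature are assumptions:

```ts
// Illustrative helper mirroring the inline coercion in the scheduled handler;
// the function shape and null-handling are assumptions, not commit code.
type Term = "spring" | "summer" | "fall" | "j-term";

interface TermConfig {
  term: Term;
  year: number;
}

async function readTermConfig(
  getAppConfig: (args: { key: string }) => Promise<string | null>,
  which: "current" | "next",
): Promise<TermConfig | null> {
  // Flags are stored as the strings "true"/"false", not booleans.
  const enabled =
    (await getAppConfig({ key: `is_scrape_${which}` })) === "true";
  if (!enabled) return null;
  const term = (await getAppConfig({ key: `${which}_term` })) as Term | null;
  const yearStr = await getAppConfig({ key: `${which}_year` });
  if (!term || !yearStr) return null;
  return { term, year: Number.parseInt(yearStr, 10) };
}
```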

apps/docs/src/content/docs/modules/scraper.md

Lines changed: 36 additions & 1 deletion
@@ -16,6 +16,8 @@ The scraper, located in the `apps/scraper` directory, is a critical component of
 
 The scraping process is designed to be robust and resilient:
 
+### Manual Scraping (Programs & Courses)
+
 1. **Admin Trigger**: Admin users can trigger scraping through the Convex backend by calling dedicated actions:
    - `api.scraper.triggerMajorsScraping` - Initiates major (program) discovery
    - `api.scraper.triggerCoursesScraping` - Initiates course discovery
@@ -29,11 +31,44 @@ The scraping process is designed to be robust and resilient:
 7. **Data Upsert**: The scraped data is then sent to the Convex backend via authenticated HTTP requests to be stored in the main database.
 8. **Error Handling**: The system includes error logging and a retry mechanism for failed jobs.
 
+### Automated Scraping (Course Offerings)
+
+Course offerings (class sections with schedule details) are scraped automatically via a scheduled Cloudflare Worker cronjob:
+
+1. **Scheduled Trigger**: The worker runs on a schedule defined in `wrangler.jsonc` to check for new course offerings.
+2. **App Config Check**: The worker reads the following configuration from Convex:
+   - `is_scrape_current` - Boolean flag to enable scraping current term
+   - `is_scrape_next` - Boolean flag to enable scraping next term
+   - `current_term` / `current_year` - Identifies the current academic term
+   - `next_term` / `next_year` - Identifies the next academic term
+3. **Discovery Jobs**: For each enabled term, the worker creates a `discover-course-offerings` job that scrapes Albert's public search to find all course offering URLs.
+4. **Individual Jobs**: Each discovered course offering URL becomes a `course-offering` job in the queue.
+5. **Data Processing**: The worker scrapes details such as class number, section, instructor, schedule, location, and enrollment status.
+6. **Backend Sync**: Scraped course offerings are sent to Convex via the `/api/courseOfferings/upsert` endpoint in batches.
+
+## Job Types
+
+The scraper supports the following job types, tracked in the D1 database:
+
+| Job Type                    | Description                                                                        |
+| --------------------------- | ---------------------------------------------------------------------------------- |
+| `discover-programs`         | Discovers all program URLs from the bulletin                                       |
+| `discover-courses`          | Discovers all course URLs from the bulletin                                        |
+| `discover-course-offerings` | Discovers course offering URLs from Albert public search for a specific term/year  |
+| `program`                   | Scrapes detailed data for a single program                                         |
+| `course`                    | Scrapes detailed data for a single course                                          |
+| `course-offering`           | Scrapes detailed data for a single course offering (section)                       |
+
+Jobs can include metadata (stored as JSON) to pass contextual information such as the academic term and year.
+
 ## Project Structure
 
 The scraper's code is organized as follows:
 
 - `src/index.ts`: The main entry point for the Cloudflare Worker, including the scheduled and queue handlers.
 - `src/drizzle/`: The Drizzle ORM schema and database connection setup.
 - `src/lib/`: Core libraries for interacting with Convex and managing the job queue.
-- `src/modules/`: The logic for discovering and scraping courses and programs.
+- `src/modules/`: The logic for discovering and scraping courses, programs, and course offerings.
+  - `programs/`: Program discovery and scraping logic
+  - `courses/`: Course discovery and scraping logic
+  - `courseOfferings/`: Course offering discovery and scraping logic (in progress)
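Job metadata is persisted as untyped JSON, and the queue handler in this commit narrows it with a cast plus a null check. A stricter runtime guard is one possible alternative; this sketch is illustrative only, not part of the commit:

```ts
// Hypothetical runtime guard for the JSON metadata column; the commit itself
// uses a type cast plus a null check, so this is an illustrative alternative.
type Term = "spring" | "summer" | "fall" | "j-term";

interface TermMetadata {
  term: Term;
  year: number;
}

function isTermMetadata(value: unknown): value is TermMetadata {
  if (typeof value !== "object" || value === null) return false;
  const v = value as Record<string, unknown>;
  return (
    typeof v.year === "number" &&
    typeof v.term === "string" &&
    ["spring", "summer", "fall", "j-term"].includes(v.term)
  );
}
```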

apps/scraper/src/drizzle/schema.ts

Lines changed: 9 additions & 1 deletion
@@ -12,8 +12,16 @@ export const jobs = sqliteTable("jobs", {
     .notNull()
     .default("pending"),
   jobType: text("job_type", {
-    enum: ["discover-programs", "discover-courses", "program", "course"],
+    enum: [
+      "discover-programs",
+      "discover-courses",
+      "discover-course-offerings",
+      "program",
+      "course",
+      "course-offering",
+    ],
   }).notNull(),
+  metadata: text("metadata", { mode: "json" }),
   createdAt: integer("created_at", { mode: "timestamp" })
     .notNull()
     .$defaultFn(() => new Date()),
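Drizzle's JSON-mode `text` column is untyped by default, which is why the queue handler below casts `job.metadata`. A possible refinement, not part of this commit, is to narrow the column with Drizzle's `.$type<...>()`:

```ts
// Possible refinement, not in this commit: narrow the JSON column so reads of
// job.metadata are typed instead of requiring casts. Table and column names
// here are illustrative.
import { integer, sqliteTable, text } from "drizzle-orm/sqlite-core";

type JobMetadata = {
  term: "spring" | "summer" | "fall" | "j-term";
  year: number;
};

export const jobsSketch = sqliteTable("jobs_sketch", {
  id: integer("id").primaryKey({ autoIncrement: true }),
  metadata: text("metadata", { mode: "json" }).$type<JobMetadata | null>(),
});
```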

apps/scraper/src/index.ts

Lines changed: 119 additions & 8 deletions
@@ -5,6 +5,10 @@ import getDB from "./drizzle";
 import { errorLogs, jobs } from "./drizzle/schema";
 import { ConvexApi } from "./lib/convex";
 import { JobError, type JobMessage } from "./lib/queue";
+import {
+  discoverCourseOfferings,
+  scrapeCourseOfferings,
+} from "./modules/courseOfferings";
 import { discoverCourses, scrapeCourse } from "./modules/courses";
 import { discoverPrograms, scrapeProgram } from "./modules/programs";
 
@@ -80,14 +84,77 @@ app.post("/api/courses", validateApiKey, async (c) => {
 export default {
   fetch: app.fetch,
 
-  async scheduled(_event: ScheduledEvent, _env: CloudflareBindings) {
-    // const db = getDB(env);
-    // const convex = new ConvexApi({
-    //   baseUrl: env.CONVEX_SITE_URL,
-    //   apiKey: env.CONVEX_API_KEY,
-    // });
-    // TODO: add albert public search
-    return;
+  async scheduled(_event: ScheduledEvent, env: CloudflareBindings) {
+    const db = getDB(env);
+    const convex = new ConvexApi({
+      baseUrl: env.CONVEX_SITE_URL,
+      apiKey: env.CONVEX_API_KEY,
+    });
+
+    // Get scraping flags from Convex app config
+    const isScrapeCurrentData = await convex.getAppConfig({
+      key: "is_scrape_current",
+    });
+    const isScrapeNextData = await convex.getAppConfig({
+      key: "is_scrape_next",
+    });
+
+    const isScrapeCurrent = isScrapeCurrentData === "true";
+    const isScrapeNext = isScrapeNextData === "true";
+
+    console.log(
+      `Cronjob: Scraping flags - current: ${isScrapeCurrent}, next: ${isScrapeNext}`,
+    );
+
+    // Collect terms to scrape
+    const termsToScrape: Array<{
+      term: "spring" | "summer" | "fall" | "j-term";
+      year: number;
+    }> = [];
+
+    if (isScrapeCurrent) {
+      const currentTerm = (await convex.getAppConfig({
+        key: "current_term",
+      })) as "spring" | "summer" | "fall" | "j-term";
+      const currentYearStr = await convex.getAppConfig({ key: "current_year" });
+      if (currentYearStr) {
+        const currentYear = Number.parseInt(currentYearStr, 10);
+        termsToScrape.push({ term: currentTerm, year: currentYear });
+      }
+    }
+
+    if (isScrapeNext) {
+      const nextTerm = (await convex.getAppConfig({ key: "next_term" })) as
+        | "spring"
+        | "summer"
+        | "fall"
+        | "j-term";
+      const nextYearStr = await convex.getAppConfig({ key: "next_year" });
+      if (nextYearStr) {
+        const nextYear = Number.parseInt(nextYearStr, 10);
+        termsToScrape.push({ term: nextTerm, year: nextYear });
+      }
+    }
+
+    // Trigger course offerings discovery for each enabled term
+    const courseOfferingsUrl = new URL(env.SCRAPING_BASE_URL).toString();
+
+    for (const { term, year } of termsToScrape) {
+      const [createdJob] = await db
+        .insert(jobs)
+        .values({
+          url: courseOfferingsUrl,
+          jobType: "discover-course-offerings",
+          metadata: { term, year },
+        })
+        .returning();
+
+      await env.SCRAPING_QUEUE.send({ jobId: createdJob.id });
+
+      console.log(
+        `Cronjob: Created course offerings discovery job [id: ${createdJob.id}, term: ${term}, year: ${year}]`,
+      );
+    }
   },
 
   async queue(
@@ -186,6 +253,50 @@ export default {
         }
         break;
       }
+      case "discover-course-offerings": {
+        const metadata = job.metadata as {
+          term: "spring" | "summer" | "fall" | "j-term";
+          year: number;
+        } | null;
+
+        if (!metadata?.term || !metadata?.year) {
+          throw new JobError(
+            "Missing term or year in job metadata",
+            "validation",
+          );
+        }
+
+        const courseOfferingUrls = await discoverCourseOfferings(
+          job.url,
+          metadata.term,
+          metadata.year,
+        );
+        const newJobs = await db
+          .insert(jobs)
+          .values(
+            courseOfferingUrls.map((url) => ({
+              url,
+              jobType: "course-offering" as const,
+              metadata: { term: metadata.term, year: metadata.year },
+            })),
+          )
+          .returning();
+
+        await env.SCRAPING_QUEUE.sendBatch(
+          newJobs.map((j) => ({ body: { jobId: j.id } })),
+        );
+        break;
+      }
+      case "course-offering": {
+        const courseOfferings = await scrapeCourseOfferings(
+          job.url,
+          db,
+          env,
+        );
+
+        await convex.upsertCourseOfferings(courseOfferings);
+        break;
+      }
     }
 
     await db
apps/scraper/src/lib/convex.ts

Lines changed: 3 additions & 3 deletions
@@ -2,7 +2,7 @@ import type { internal } from "@albert-plus/server/convex/_generated/api";
 import {
   ZGetAppConfig,
   type ZSetAppConfig,
-  ZUpsertCourseOffering,
+  ZUpsertCourseOfferings,
   ZUpsertCourseWithPrerequisites,
   ZUpsertProgramWithRequirements,
 } from "@albert-plus/server/convex/http";
@@ -73,12 +73,12 @@ export class ConvexApi {
     return res.data;
   }
 
-  async upsertCourseOffering(data: z.infer<typeof ZUpsertCourseOffering>) {
+  async upsertCourseOfferings(data: z.infer<typeof ZUpsertCourseOfferings>) {
     const res = await this.request<
       FunctionReturnType<
         typeof internal.courseOfferings.upsertCourseOfferingInternal
       >
-    >("/api/courseOfferings/upsert", ZUpsertCourseOffering, data);
+    >("/api/courseOfferings/upsert", ZUpsertCourseOfferings, data);
     return res.data;
   }
 
apps/scraper/src/modules/courseOfferings/index.ts

Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+/** biome-ignore-all lint/correctness/noUnusedFunctionParameters: bypass for now */
+import type { ZUpsertCourseOfferings } from "@albert-plus/server/convex/http";
+import type { DrizzleD1Database } from "drizzle-orm/d1";
+import type * as z from "zod/mini";
+
+export type CourseOfferingData = z.infer<typeof ZUpsertCourseOfferings>;
+
+export async function discoverCourseOfferings(
+  url: string,
+  term: "spring" | "summer" | "fall" | "j-term",
+  year: number,
+): Promise<string[]> {
+  // TODO: implement this function to scrape the Albert public search listing
+  // This should extract all course URLs from the search page for the given term/year
+  // Example: returns ["https://albert.../CSCI-UA-101?term=spring&year=2025", ...]
+  return [];
+}
+
+export async function scrapeCourseOfferings(
+  url: string,
+  db: DrizzleD1Database,
+  env: CloudflareBindings,
+): Promise<CourseOfferingData> {
+  // TODO: implement this function to scrape a single course page
+  throw new Error("Not implemented");
+}
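Both exports are stubs for now (the docs mark the module "in progress"). Purely for illustration, one plausible shape for the discovery step using the Workers-native `HTMLRewriter`; the query parameter names and the `/class-search/` link heuristic are assumptions about Albert's markup, not details from this commit:

```ts
// Illustrative only: one way discoverCourseOfferings might eventually look.
// Query parameters and the anchor-href heuristic are assumptions about
// Albert's public search markup.
async function discoverCourseOfferingsSketch(
  baseUrl: string,
  term: "spring" | "summer" | "fall" | "j-term",
  year: number,
): Promise<string[]> {
  const searchUrl = new URL(baseUrl);
  searchUrl.searchParams.set("term", term); // assumed parameter name
  searchUrl.searchParams.set("year", String(year)); // assumed parameter name

  const res = await fetch(searchUrl);
  if (!res.ok) {
    throw new Error(`Albert search request failed: ${res.status}`);
  }

  const urls: string[] = [];
  // HTMLRewriter is the Workers-native streaming HTML parser.
  const rewritten = new HTMLRewriter()
    .on("a", {
      element(el) {
        const href = el.getAttribute("href");
        // Assumed heuristic: offering links live under a /class-search/ path.
        if (href?.includes("/class-search/")) {
          urls.push(new URL(href, searchUrl).toString());
        }
      },
    })
    .transform(res);
  await rewritten.arrayBuffer(); // drain the stream so handlers run

  return urls;
}
```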
