Skip to content

Commit 0465768

Browse files
mini-bomba and QamarQ authored
refactor(scraper): fix, optimize, refactor (#207)
* refactor(registrationScraper): general cleanup - replaced export {} syntax with marking each function as export at declaration - exported all interfaces - refactored all scrap*() arrow functions to proper functions with proper return typing - added some new interfaces for return types of scrap*() functions - made all scrap*() functions throw improved errors instead of returning undefined * refactor(scraper): major scraper command refactor - split the main `run()` function into multiple task functions - task functions may share data between them using properties of the command object - tasks should batch updates and run them all in a few queries by using the `*Many()` method variants of lucid models - implemented simple async semaphores for ratelimiting - the number of running parallel fetch and DB tasks is limited and can be adjusted using commandline flags - it actually works now (on my machine) * fix(scraper): eliminate duplicate rows using a constraint * refactor(scraper): rewrite the archive task in raw SQL results in a ~40% speedup in that task (~25s -> ~15s) * ci: bump node version to 22 i need that Set.difference in my scraper * fix(migrations): keep the lowest ID duplicate group instead of highest * fix(scraper): reorder group objects after fetching today's session of pointless debugging was brought to you by today's sponsor, adonis! do you want your code to absolutely explode every time you attempt to do a bulk SQL action? do you despise the common-sense assumptions, such as the bulk fetch function returning items in the same order as in the list you provided? do you like wasting hours sitting in the debugger, inventing new debugging techniques, such as setting a conditional breakpoint on `Math.random() < 0.001`? then adonis is perfect for you! rewrite your web project in adonis today! use promo code `mini_bomba` to get 50% more pointless debugging for your first rewrite and a free database implosion on your first tests in production! 
* fix(frontend): fix connection to api on dev mode * feat(scraper): vacuum & analyze tables after scrape * refactor(scraper): minor code cleanup as suggested in code review - replace () => {return {...};} with () => ({...}) - remove commented-out code - move utils to their own files in /app/utils - create '#utils/' subpath imports for /app/utils --------- Co-authored-by: QamarQ <kamilm@you2.pl>
1 parent b0cfb3d commit 0465768

File tree

9 files changed

+613
-265
lines changed

9 files changed

+613
-265
lines changed

backend/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM node:20.12.2-alpine3.18 AS base
1+
FROM node:22.14.0-alpine3.21 AS base
22

33
# All deps stage
44
FROM base AS deps

backend/app/scrap-registrations/scrap_registrations.ts

Lines changed: 70 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import * as cheerio from "cheerio";
22

3-
import logger from "@adonisjs/core/services/logger";
4-
5-
interface GroupDetails {
3+
export interface ScrapedGroupSummary {
64
name: string;
75
type: string;
86
group: string;
@@ -12,27 +10,27 @@ interface GroupDetails {
1210
endTime: string;
1311
lecturer: string;
1412
}
15-
interface Group {
13+
export interface ScrapedGroup {
1614
url: string;
17-
groups: GroupDetails[];
15+
groups: ScrapedGroupSummary[];
1816
}
1917

20-
interface Course {
18+
export interface ScrapedCourse {
2119
name: string;
2220
courseCode: string;
2321
url: string;
24-
groups: Group[];
22+
groups: ScrapedGroup[];
2523
}
26-
interface Registration {
24+
export interface ScrapedRegistration {
2725
name: string;
2826
url: string;
29-
courses: Course[];
27+
courses: ScrapedCourse[];
3028
}
3129

32-
interface Department {
30+
export interface ScrapedDepartment {
3331
name: string;
3432
url: string;
35-
registrations: Registration[];
33+
registrations: ScrapedRegistration[];
3634
}
3735

3836
const DEPARTMENTS_URL =
@@ -57,12 +55,13 @@ async function fetchData(url: string, options = {}, timeout = 10000) {
5755
}
5856
}
5957

60-
const scrapDepartments = async () => {
61-
const departments: Department[] = [];
58+
export async function scrapDepartments(): Promise<ScrapedDepartment[]> {
59+
const departments: ScrapedDepartment[] = [];
6260
const response = await fetchData(DEPARTMENTS_URL);
6361
if (!response.ok) {
64-
logger.info("Something went wrong in fetching departments");
65-
return;
62+
throw new Error(
63+
`Got response code ${response.status} ${response.statusText} while fetching departments`,
64+
);
6665
}
6766
const body = await response.text();
6867
const $ = cheerio.load(body);
@@ -71,7 +70,7 @@ const scrapDepartments = async () => {
7170
.find(".autostrong")
7271
.children("tr");
7372
departmentsBlock.each((_, element) => {
74-
const newDepartment: Department = {
73+
const newDepartment: ScrapedDepartment = {
7574
name: "",
7675
url: "",
7776
registrations: [],
@@ -87,15 +86,18 @@ const scrapDepartments = async () => {
8786
departments.push(newDepartment);
8887
});
8988
return departments;
90-
};
89+
}
9190

92-
const scrapRegistrations = async (departmentUrl: string) => {
91+
export async function scrapRegistrations(
92+
departmentUrl: string,
93+
): Promise<ScrapedRegistration[]> {
9394
const registrationsNames: string[] = [];
9495
const registrationsUrls: string[] = [];
9596
const response = await fetchData(departmentUrl);
9697
if (!response.ok) {
97-
logger.info("Something went wrong in fetching registrations");
98-
return;
98+
throw new Error(
99+
`Got response code ${response.status} ${response.statusText} while fetching registrations`,
100+
);
99101
}
100102
const body = await response.text();
101103
const $ = cheerio.load(body);
@@ -115,16 +117,21 @@ const scrapRegistrations = async (departmentUrl: string) => {
115117
}
116118
});
117119
return registrationsNames.map((name, index) => {
118-
return { name, url: registrationsUrls[index], courses: [] as Course[] };
120+
return {
121+
name,
122+
url: registrationsUrls[index],
123+
courses: [],
124+
};
119125
});
120-
};
126+
}
121127

122-
const scrapCourses = async (registrationUrl: string) => {
128+
export async function scrapCourses(registrationUrl: string): Promise<string[]> {
123129
const coursesUrls: string[] = [];
124130
const response = await fetchData(registrationUrl);
125131
if (!response.ok) {
126-
logger.info("Something went wrong in fetching courses");
127-
return;
132+
throw new Error(
133+
`Got response code ${response.status} ${response.statusText} while fetching courses`,
134+
);
128135
}
129136

130137
const body = await response.text();
@@ -142,16 +149,25 @@ const scrapCourses = async (registrationUrl: string) => {
142149
}
143150
});
144151
return coursesUrls;
145-
};
152+
}
153+
154+
export interface ScrapedCourseDetails {
155+
courseName: string;
156+
courseCode: string;
157+
urls: string[];
158+
}
146159

147-
const scrapCourseNameGroupsUrls = async (courseUrl: string) => {
160+
export async function scrapCourseNameGroupsUrls(
161+
courseUrl: string,
162+
): Promise<ScrapedCourseDetails> {
148163
let courseName = "";
149164
const urls: string[] = [];
150165
let courseCode = "";
151166
const response = await fetchData(courseUrl);
152167
if (!response.ok) {
153-
logger.info("Something went wrong in fetching groups");
154-
return;
168+
throw new Error(
169+
`Got response code ${response.status} ${response.statusText} while fetching course details`,
170+
);
155171
}
156172

157173
const body = await response.text();
@@ -186,14 +202,15 @@ const scrapCourseNameGroupsUrls = async (courseUrl: string) => {
186202
}
187203
});
188204
return { courseName, urls, courseCode };
189-
};
205+
}
190206

191-
const scrapGroupsUrls = async (groupUrl: string) => {
207+
export async function scrapGroupsUrls(groupUrl: string): Promise<string[]> {
192208
const groupsUrls: string[] = [];
193209
const response = await fetchData(groupUrl);
194210
if (!response.ok) {
195-
logger.info("Something went wrong in fetching groups");
196-
return;
211+
throw new Error(
212+
`Got response code ${response.status} ${response.statusText} while fetching group URLs`,
213+
);
197214
}
198215

199216
const body = await response.text();
@@ -207,13 +224,28 @@ const scrapGroupsUrls = async (groupUrl: string) => {
207224
}
208225
});
209226
return groupsUrls;
210-
};
227+
}
211228

212-
const scrapGroupDetails = async (groupUrl: string) => {
229+
export interface ScrapedGroupDetails {
230+
name: string;
231+
type: string;
232+
group: string;
233+
week: string;
234+
days: string[];
235+
startTimeEndTimes: { startTime: string; endTime: string }[];
236+
lecturer: string;
237+
spotsOccupied: number;
238+
spotsTotal: number;
239+
}
240+
241+
export async function scrapGroupDetails(
242+
groupUrl: string,
243+
): Promise<ScrapedGroupDetails> {
213244
const response = await fetchData(groupUrl);
214245
if (!response.ok) {
215-
logger.info("Something went wrong in fetching groups");
216-
return;
246+
throw new Error(
247+
`Got response code ${response.status} ${response.statusText} while fetching group details`,
248+
);
217249
}
218250

219251
const body = await response.text();
@@ -296,7 +328,7 @@ const scrapGroupDetails = async (groupUrl: string) => {
296328
spotsOccupied: Number.isNaN(spotsOccupiedNumber) ? 0 : spotsOccupiedNumber,
297329
spotsTotal: Number.isNaN(spotsTotalNumber) ? 0 : spotsTotalNumber,
298330
};
299-
};
331+
}
300332

301333
const getStartEndTime = (time: string) => {
302334
if (time.includes("brak danych")) {
@@ -365,12 +397,3 @@ const checkDay = (day: string) => {
365397
}
366398
return "unknown";
367399
};
368-
369-
export {
370-
scrapDepartments,
371-
scrapRegistrations,
372-
scrapCourses,
373-
scrapCourseNameGroupsUrls,
374-
scrapGroupsUrls,
375-
scrapGroupDetails,
376-
};

backend/app/utils/arrays.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
export function chunkArray<T>(array: T[], chunkSize: number): T[][] {
2+
const result = [];
3+
const input = Array.from(array);
4+
while (input.length > 0) {
5+
result.push(input.splice(0, chunkSize));
6+
}
7+
return result;
8+
}
9+
10+
export function zip<T1, T2>(a1: T1[], a2: T2[]): [T1, T2][] {
11+
const array1 = Array.from(a1);
12+
const array2 = Array.from(a2);
13+
const result: [T1, T2][] = [];
14+
while (array1.length > 0 && array2.length > 0) {
15+
const el1 = array1.shift() as T1;
16+
const el2 = array2.shift() as T2;
17+
result.push([el1, el2]);
18+
}
19+
return result;
20+
}

backend/app/utils/semaphore.ts

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
export class Semaphore {
2+
capacity: number;
3+
#currentTasks: number;
4+
#waitingTasks: (() => void)[];
5+
6+
constructor(capacity: number) {
7+
this.capacity = capacity;
8+
this.#currentTasks = 0;
9+
this.#waitingTasks = [];
10+
}
11+
12+
public get currentTasks(): number {
13+
return this.#currentTasks;
14+
}
15+
16+
public async runTask<T>(task: () => Promise<T>): Promise<T> {
17+
// acquire the semaphore
18+
await this.acquire();
19+
try {
20+
// execute the task
21+
return await task();
22+
} finally {
23+
// don't forget to release
24+
this.release();
25+
}
26+
}
27+
28+
private acquire(): Promise<void> {
29+
// if we're under capacity, bump the count and resolve immediately
30+
if (this.capacity > this.#currentTasks) {
31+
this.#currentTasks += 1;
32+
return Promise.resolve();
33+
}
34+
// otherwise add ourselves to the queue
35+
return new Promise((resolve) => this.#waitingTasks.push(resolve));
36+
}
37+
38+
private release() {
39+
// try waking up the next task
40+
const nextTask = this.#waitingTasks.shift();
41+
if (nextTask === undefined) {
42+
// no task in queue, decrement task count
43+
this.#currentTasks -= 1;
44+
} else {
45+
// wake up the task
46+
nextTask();
47+
}
48+
}
49+
}

0 commit comments

Comments
 (0)