Skip to content

Commit c88956b

Browse files
committed
fix(scraper): Eliminate duplicate rows using a constraint
1 parent 8721395 commit c88956b

File tree

2 files changed

+130
-53
lines changed

2 files changed

+130
-53
lines changed

backend/commands/scraper.ts

Lines changed: 91 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import { TaskCallback } from "@poppinss/cliui/types";
2+
import { DateTime } from "luxon";
23
import assert from "node:assert";
34

45
import { BaseCommand, flags } from "@adonisjs/core/ace";
56
import type { CommandOptions } from "@adonisjs/core/types/ace";
7+
import db from "@adonisjs/lucid/services/db";
68

79
import Course from "#models/course";
810
import Department from "#models/department";
@@ -388,72 +390,108 @@ export default class Scraper extends BaseCommand {
388390
const lecturerMap = new Map(zip(lecturerSet, lecturersIds));
389391

390392
task.update("Updating groups");
393+
const currentDate = DateTime.now();
391394
// set all groups to inactive, query below will activate scraped ones
392395
await Group.query().update({ isActive: false });
393-
const groupQueue = fetchedDetails.flatMap(
394-
({ url, registration, course, details }) =>
396+
const preparedGroups = fetchedDetails.flatMap(
397+
({ url, registration, course, details, lecturers }) =>
395398
details.days.map((day) => {
396399
return {
397-
name: details.name.slice(0, 255),
398-
startTime: details.startTimeEndTimes[
399-
details.days.indexOf(day)
400-
].startTime.slice(0, 255),
401-
endTime: details.startTimeEndTimes[
402-
details.days.indexOf(day)
403-
].endTime.slice(0, 255),
404-
group: details.group.slice(0, 255),
405-
week: details.week as "-" | "TP" | "TN",
406-
day: day.slice(0, 255),
407-
type: details.type.slice(0, 255),
408-
courseId:
409-
course.courseCode.slice(0, 255) +
410-
(extractLastStringInBrackets(registration.name) ??
411-
registration.name),
412-
spotsOccupied: details.spotsOccupied,
413-
spotsTotal: details.spotsTotal,
414-
url: url.slice(0, 255),
415-
isActive: true,
400+
row: {
401+
name: details.name.slice(0, 255),
402+
start_time: details.startTimeEndTimes[
403+
details.days.indexOf(day)
404+
].startTime.slice(0, 255),
405+
end_time: details.startTimeEndTimes[
406+
details.days.indexOf(day)
407+
].endTime.slice(0, 255),
408+
group: details.group.slice(0, 255),
409+
week: details.week as "-" | "TP" | "TN",
410+
day: day.slice(0, 255),
411+
type: details.type.slice(0, 255),
412+
course_id:
413+
course.courseCode.slice(0, 255) +
414+
(extractLastStringInBrackets(registration.name) ??
415+
registration.name),
416+
spots_occupied: details.spotsOccupied,
417+
spots_total: details.spotsTotal,
418+
url: url.slice(0, 255),
419+
is_active: true,
420+
created_at: currentDate,
421+
updated_at: currentDate,
422+
},
423+
lecturers,
416424
};
417425
}),
418426
);
427+
428+
const uniqueRows = Array.from(
429+
new Map(
430+
preparedGroups.map(({ row, lecturers }) => [
431+
JSON.stringify([
432+
row.name,
433+
row.start_time,
434+
row.end_time,
435+
row.group,
436+
row.week,
437+
row.day,
438+
row.type,
439+
row.course_id,
440+
]),
441+
{ row, lecturers },
442+
]),
443+
).values(),
444+
);
445+
const mergedProps = Array.from(
446+
new Set(Object.keys(uniqueRows[0].row)).difference(
447+
new Set([
448+
"created_at",
449+
"name",
450+
"start_time",
451+
"end_time",
452+
"group",
453+
"week",
454+
"day",
455+
"type",
456+
"course_id",
457+
]),
458+
),
459+
);
419460
const groups = await Promise.all(
420-
chunkArray(groupQueue, QUERY_CHUNK_SIZE).map((chunk) =>
421-
this.dbSemaphore.runTask(() =>
422-
Group.updateOrCreateMany(
423-
[
424-
"name",
425-
"startTime",
426-
"endTime",
427-
"group",
428-
"week",
429-
"day",
430-
"type",
431-
"courseId",
432-
],
433-
chunk,
434-
),
435-
),
461+
chunkArray(uniqueRows, QUERY_CHUNK_SIZE).map((chunk) =>
462+
this.dbSemaphore.runTask(async () => {
463+
const ids = (await db
464+
.knexQuery()
465+
.insert(chunk.map((el) => el.row))
466+
.into("groups")
467+
.onConflict(
468+
db.knexRawQuery('ON CONSTRAINT "groups_scraper_uindex"'),
469+
)
470+
.merge(mergedProps)
471+
.returning("id")) as { id: number }[];
472+
const updatedGroups = await Group.findMany(ids.map((i) => i.id));
473+
return zip(updatedGroups, chunk).map(([group, { lecturers }]) => {
474+
return { group, lecturers };
475+
});
476+
//Group.updateOrCreateMany(
477+
// ["url", "startTime", "day", "week", "courseId"],
478+
// chunk,
479+
//),
480+
}),
436481
),
437482
).then((a) => a.flat());
438483

439484
task.update("Updating group lecturers");
440-
const groupLecturers = zip(
441-
fetchedDetails.flatMap(({ lecturers, details }) => {
442-
const ids = lecturers.map((lecturer) => {
443-
const id = lecturerMap.get(lecturer);
444-
assert(id !== undefined);
445-
return id;
446-
});
447-
return details.days.map(() => ids);
448-
}),
449-
groups,
450-
);
451-
452485
await Promise.all(
453-
groupLecturers.map(([lecturers, group]) =>
454-
this.dbSemaphore.runTask(() =>
455-
group.related("lecturers").sync(lecturers),
456-
),
486+
groups.map(({ group, lecturers }) =>
487+
this.dbSemaphore.runTask(async () => {
488+
const ids = lecturers.map((lecturer) => {
489+
const id = lecturerMap.get(lecturer);
490+
assert(id !== undefined);
491+
return id;
492+
});
493+
await group.related("lecturers").sync(ids);
494+
}),
457495
),
458496
);
459497
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import { BaseSchema } from "@adonisjs/lucid/schema";
2+
3+
/**
 * Migration for the "groups" table: removes rows that are duplicated on the
 * scraper's identifying columns and then adds the "groups_scraper_uindex"
 * unique constraint over those same columns, so later inserts can rely on
 * `ON CONFLICT ON CONSTRAINT "groups_scraper_uindex"`.
 */
export default class extends BaseSchema {
  protected tableName = "groups";

  /**
   * Deduplicates "groups" and adds the unique constraint.
   *
   * For every set of rows sharing ("name", "start_time", "end_time", "group",
   * "week", "day", "type", "course_id") only the row with the highest "id" is
   * kept; the others are deleted.
   *
   * No explicit BEGIN/COMMIT is issued here: Lucid wraps each migration in a
   * transaction by default (unless `disableTransactions` is set), and a nested
   * COMMIT would end that outer transaction early, detaching the migration
   * bookkeeping from the schema change.
   * NOTE(review): the equality comparisons below never match NULLs, so
   * duplicate rows containing NULL in any of these columns are left in place —
   * confirm the columns are NOT NULL or that this is acceptable.
   */
  async up() {
    await this.db.rawQuery(`
      WITH "duplicates" AS (
        SELECT MAX("id") AS "id", "name", "start_time", "end_time", "group", "week", "day", "type", "course_id"
        FROM "groups"
        GROUP BY "name", "start_time", "end_time", "group", "week", "day", "type", "course_id"
        HAVING COUNT(*) > 1
      )
      DELETE FROM "groups"
      USING "duplicates"
      WHERE "groups"."id" <> "duplicates"."id"
        AND "groups"."name" = "duplicates"."name"
        AND "groups"."start_time" = "duplicates"."start_time"
        AND "groups"."end_time" = "duplicates"."end_time"
        AND "groups"."group" = "duplicates"."group"
        AND "groups"."week" = "duplicates"."week"
        AND "groups"."day" = "duplicates"."day"
        AND "groups"."type" = "duplicates"."type"
        AND "groups"."course_id" = "duplicates"."course_id";
      ALTER TABLE "groups"
        ADD CONSTRAINT "groups_scraper_uindex"
        UNIQUE ("name", "start_time", "end_time", "group", "week", "day", "type", "course_id");
    `);
  }

  /**
   * Drops the "groups_scraper_uindex" constraint. Rows deleted by `up()` are
   * not restored.
   */
  async down() {
    await this.db.rawQuery(`
      ALTER TABLE "groups"
        DROP CONSTRAINT "groups_scraper_uindex";
    `);
  }
}

0 commit comments

Comments
 (0)