Skip to content

Commit 48863a1

Browse files
authored
Decode HTML entities in module description (#3139)
* yarn add he * add decode function * Use decode function * Add test * test less verbose * check null or undefined * remove compatible operator in package.json
1 parent 77729cc commit 48863a1

File tree

6 files changed

+49
-1
lines changed

6 files changed

+49
-1
lines changed

scrapers/nus-v2/package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"@types/bunyan": "1.8.6",
2121
"@types/deep-diff": "1.0.0",
2222
"@types/fs-extra": "9.0.6",
23+
"@types/he": "1.1.1",
2324
"@types/lodash": "4.14.167",
2425
"@types/mock-fs": "4.13.0",
2526
"@types/node": "14.14.19",
@@ -53,6 +54,7 @@
5354
"date-fns": "2.16.1",
5455
"deep-diff": "1.0.2",
5556
"fs-extra": "9.0.1",
57+
"he": "1.2.0",
5658
"joi": "17.3.0",
5759
"lodash": "4.17.20",
5860
"nusmoderator": "3.0.0",

scrapers/nus-v2/src/tasks/GetSemesterData.test.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,23 @@ describe(cleanModuleInfo, () => {
151151
moduleCode: 'EL5102',
152152
});
153153
});
154+
155+
test('should decode HTML entities in description', () => {
156+
expect(
157+
cleanModuleInfo({
158+
acadYear: '2020/2021',
159+
description: 'These concepts pertain to the structure of &quot;ultimate reality&quot;...',
160+
title: 'Metaphysics',
161+
department: 'Philosophy',
162+
faculty: 'Arts and Social Science',
163+
moduleCredit: '4',
164+
moduleCode: 'PH2213',
165+
}),
166+
).toHaveProperty(
167+
'description',
168+
'These concepts pertain to the structure of "ultimate reality"...',
169+
);
170+
});
154171
});
155172

156173
describe(parseWorkload, () => {

scrapers/nus-v2/src/tasks/GetSemesterData.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import GetSemesterTimetable from './GetSemesterTimetable';
2121
import GetSemesterModules from './GetSemesterModules';
2222
import { fromTermCode } from '../utils/api';
2323
import { validateSemester } from '../services/validation';
24-
import { removeEmptyValues, titleize, trimValues } from '../utils/data';
24+
import { removeEmptyValues, titleize, trimValues, decodeHTMLEntities } from '../utils/data';
2525
import { difference } from '../utils/set';
2626
import { Logger } from '../services/logger';
2727

@@ -84,6 +84,7 @@ export function mapAttributes(
8484
* - Remove empty fields and fields with text like 'nil'
8585
* - Trim whitespace from module title, description and other text fields
8686
* - Properly capitalize ALL CAPS title
87+
* - Decode HTML entities in description such as '&#224;' to 'à'
8788
*/
8889
export function cleanModuleInfo(module: SemesterModule) {
8990
let cleanedModule = module;
@@ -93,6 +94,10 @@ export function cleanModuleInfo(module: SemesterModule) {
9394
cleanedModule.title = titleize(cleanedModule.title);
9495
}
9596

97+
if (cleanedModule.description != null) {
98+
cleanedModule.description = decodeHTMLEntities(cleanedModule.description);
99+
}
100+
96101
// Remove empty values like 'nil' and empty strings for keys that allow them
97102
// to be nullable
98103
cleanedModule = removeEmptyValues(cleanedModule, [

scrapers/nus-v2/src/utils/data.test.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import {
99
removeEmptyValues,
1010
titleize,
1111
trimValues,
12+
decodeHTMLEntities,
1213
ZWSP,
1314
} from './data';
1415

@@ -35,6 +36,14 @@ describe(titleize, () => {
3536
});
3637
});
3738

39+
describe(decodeHTMLEntities, () => {
40+
test('should decode HTML entities', () => {
41+
expect(
42+
decodeHTMLEntities('&amp; Schr&ouml;dinger cried, &quot;Oh l&agrave; l&agrave;!&quot;'),
43+
).toEqual('& Schrödinger cried, "Oh là là!"');
44+
});
45+
});
46+
3847
describe(trimValues, () => {
3948
test('should remove whitespace around the given values', () => {
4049
expect(

scrapers/nus-v2/src/utils/data.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
*/
44

55
import { uniq, trim, groupBy, values } from 'lodash';
6+
import { decode } from 'he';
67
import { VenueLesson } from '../types/venues';
78
import {
89
DayText,
@@ -46,6 +47,10 @@ export function titleize(string: string) {
4647
return capitalized;
4748
}
4849

50+
export function decodeHTMLEntities(string: string) {
51+
return decode(string);
52+
}
53+
4954
/**
5055
* Remove keys with empty values, null or strings like 'nil', 'none'
5156
* Mutates the input object

scrapers/nus-v2/yarn.lock

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,11 @@
683683
dependencies:
684684
"@types/node" "*"
685685

686+
"@types/he@1.1.1":
687+
version "1.1.1"
688+
resolved "https://registry.yarnpkg.com/@types/he/-/he-1.1.1.tgz#19e14033c4ee8f1a702c74dcc6182664839ac2b7"
689+
integrity sha512-jpzrsR1ns0n3kyWt92QfOUQhIuJGQ9+QGa7M62rO6toe98woQjnsnzjdMtsQXCdvjjmqjS2ZBCC7xKw0cdzU+Q==
690+
686691
"@types/istanbul-lib-coverage@*", "@types/istanbul-lib-coverage@^2.0.0", "@types/istanbul-lib-coverage@^2.0.1":
687692
version "2.0.3"
688693
resolved "https://registry.yarnpkg.com/@types/istanbul-lib-coverage/-/istanbul-lib-coverage-2.0.3.tgz#4ba8ddb720221f432e443bd5f9117fd22cfd4762"
@@ -2333,6 +2338,11 @@ has@^1.0.3:
23332338
dependencies:
23342339
function-bind "^1.1.1"
23352340

2341+
2342+
version "1.2.0"
2343+
resolved "https://registry.yarnpkg.com/he/-/he-1.2.0.tgz#84ae65fa7eafb165fddb61566ae14baf05664f0f"
2344+
integrity sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==
2345+
23362346
hosted-git-info@^2.1.4:
23372347
version "2.8.8"
23382348
resolved "https://registry.yarnpkg.com/hosted-git-info/-/hosted-git-info-2.8.8.tgz#7539bd4bc1e0e0a895815a2e0262420b12858488"

0 commit comments

Comments
 (0)