Skip to content

Commit 7e20b67

Browse files
committed
Improve Unicode script (#881)
Overhauled the script to extract all available revisions for each of the standards, so it is possible to link to a specific one. The main URL for every Unicode standard now also points to the latest version live on their website.
1 parent 39ab837 commit 7e20b67

File tree

1 file changed

+143
-47
lines changed

1 file changed

+143
-47
lines changed

scripts/unicode.js

Lines changed: 143 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,31 @@ const skip = new Set([
2929
// Not in HTML
3030
25, 54,
3131
]);
32+
const MAX_CONCURRENCY = 10;
33+
const REFETCH_OLD_VERSIONS = false;
3234

33-
async.each(range(1, MAX_REPORT), (num, cb) => {
35+
async.eachLimit(range(1, MAX_REPORT), MAX_CONCURRENCY, (num, cb) => {
3436
if (skip.has(num)) {
3537
console.log('Skipping report #' + num);
3638
cb();
3739
return;
3840
}
3941

40-
const url = `https://www.unicode.org/reports/tr${num}/`;
42+
recurseStandard(num, `https://www.unicode.org/reports/tr${num}/`, null, cb);
43+
}, (err) => {
44+
if (err) {
45+
console.log('there was an error');
46+
console.error(err);
47+
return;
48+
}
49+
const output = {};
50+
for (const key of Object.keys(current).sort()) {
51+
output[key] = current[key];
52+
}
53+
helper.writeBiblio(FILENAME, output);
54+
});
55+
56+
function recurseStandard(num, url, latestId, cb) {
4157
console.log('Fetching', url, '...');
4258
request({
4359
url,
@@ -53,13 +69,15 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
5369
console.log('Parsing', url, '...');
5470
const dom = new JSDOM(body, { url });
5571
const { document } = dom.window;
56-
const type = document.title.slice(0, 3);
57-
if (type !== 'UTS' && type !== 'UTR' && type !== 'UAX') {
58-
console.log('Unable to parse title', document.title);
72+
73+
const infoTableEl = document.querySelector('.body > table');
74+
const infoTable = infoTableEl && parseTable(infoTableEl);
75+
if (!infoTable) {
76+
console.log('Unable to find information table');
5977
cb();
6078
return;
6179
}
62-
const id = type + num;
80+
6381
const statusEl = document.querySelector('.body > h2');
6482
if (!statusEl) {
6583
console.log('Unable to find status');
@@ -68,6 +86,24 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
6886
}
6987
const status = trimText(statusEl.textContent);
7088

89+
let type = document.title.match(/\b(UTS|UTR|UAX)/);
90+
if (type !== 'UTS' && type !== 'UTR' && type !== 'UAX') {
91+
// Fallback for https://www.unicode.org/reports/tr35/
92+
const lowerStatus = status.toLowerCase();
93+
if (lowerStatus.indexOf('technical standard') != -1) {
94+
type = 'UTS';
95+
} else if (lowerStatus.indexOf('standard annex') != -1) {
96+
type = 'UAX';
97+
} else if (lowerStatus.indexOf('technical report') != -1) {
98+
type = 'UTR';
99+
} else {
100+
console.log('Unable to parse document type');
101+
cb();
102+
return;
103+
}
104+
}
105+
const thisId = type + num;
106+
71107
const titleEl = statusEl.nextElementSibling;
72108
if (!titleEl || titleEl.tagName !== 'H1') {
73109
console.log('Unable to find title');
@@ -78,69 +114,101 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
78114
if (!/[a-z]/.test(title))
79115
title = titleCase(title);
80116

81-
const infoTableEl = document.querySelector('.body > table');
82-
const infoTable = infoTableEl && parseTable(infoTableEl);
83-
if (!infoTable) {
84-
console.log('Unable to find information table');
117+
if (latestId == null) {
118+
// This is first scanned document, so the latest version.
119+
latestId = thisId;
120+
121+
const authors = infoTable.Editor && parseEditor(infoTable.Editor);
122+
if (!authors) {
123+
console.log('Unable to find/parse editors in table');
124+
cb();
125+
return;
126+
}
127+
128+
current[thisId] = {
129+
href: url,
130+
authors,
131+
etAl: authors.etAl,
132+
title,
133+
status,
134+
publisher: 'Unicode Consortium',
135+
versions: current[latestId]?.versions ?? {}
136+
};
137+
} else if (thisId != latestId) {
138+
// The document was renamed at some point - create link
139+
current[thisId] = { aliasOf: latestId };
140+
}
141+
142+
const href = processURL(infoTable['This Version']);
143+
if (!href) {
144+
console.log('Failed to extract version URL');
85145
cb();
86146
return;
87147
}
88148

89-
const date = trimText(infoTable.Date);
90-
if (!date) {
91-
console.log('Unable to find date in table');
149+
const revision = parseRevision(href);
150+
if (!revision) {
151+
console.log('Failed to extract revision');
92152
cb();
93153
return;
94154
}
95-
let isRawDate = /\d{4}-\d{2}-\d{2}/.test(date);
96-
97-
const href = processURL(infoTable['This Version'] || url);
98155

99-
const authors = infoTable.Editor && parseEditor(infoTable.Editor);
100-
if (!authors) {
101-
console.log('Unable to find/parse editors in table');
156+
if (!infoTable.Date) {
157+
console.log('Unable to find date in table');
158+
cb();
159+
return;
160+
}
161+
/*
162+
* Replace all spaces. We cannot simply trim as https://www.unicode.org/reports/tr57/tr57-2.html
163+
* contains "2024- 07-01" due to the coloring.
164+
*/
165+
const rawDate = infoTable.Date.replace(/\s/g, '');
166+
if (!/\d{4}-\d{2}-\d{2}/.test(rawDate)) {
167+
console.log('Unable to parse data in table');
102168
cb();
103169
return;
104170
}
105171

106-
if (type !== 'UAX' && current[`UAX${num}`])
107-
current[`UAX${num}`] = { aliasOf: id };
108-
if (type !== 'UTR' && current[`UTR${num}`])
109-
current[`UTR${num}`] = { aliasOf: id };
110-
if (type !== 'UTS' && current[`UTS${num}`])
111-
current[`UTS${num}`] = { aliasOf: id };
172+
const version = parseVersion(infoTable.Version);
173+
if (version)
174+
title = `${title} version ${version}`;
175+
else
176+
title = `${title} revision ${revision}`;
112177

113-
current[id] = {
114-
authors,
115-
etAl: authors.etAl,
178+
const wasAlreadyDefined = revision in current[latestId].versions;
179+
current[latestId].versions[revision] = {
116180
href,
181+
rawDate,
117182
title,
118-
date: isRawDate ? undefined : date,
119-
rawDate: isRawDate ? date : undefined,
120-
status,
121-
publisher: 'Unicode Consortium'
183+
status: current[latestId].status != status ? status : undefined,
122184
};
185+
186+
/*
187+
* If this revision was already defined, then don't waste time and bandwidth fetching
188+
* previous revisions which should have no changes.
189+
*
190+
* We're running this check after updating the information for this version in case this
191+
* is the latest and is a WIP, as we have already downloaded it anyway.
192+
*/
193+
if (!wasAlreadyDefined || REFETCH_OLD_VERSIONS) {
194+
const previousUrl = processURL(infoTable['Previous Version']);
195+
if (previousUrl) {
196+
recurseStandard(num, previousUrl, latestId, cb);
197+
return;
198+
}
199+
}
123200
cb();
124201
});
125-
}, (err) => {
126-
if (err) {
127-
console.log('there was an error');
128-
console.error(err);
129-
return;
130-
}
131-
const output = {};
132-
for (const key of Object.keys(current).sort()) {
133-
output[key] = current[key];
134-
}
135-
helper.writeBiblio(FILENAME, output);
136-
});
202+
}
137203

138204
function* range(from, until) {
139205
for (let i = from; i <= until; i++)
140206
yield i;
141207
}
142208

143209
function trimText(str) {
210+
if (!str)
211+
return str;
144212
return str.replace(/®/g, '').trim().replace(/\s+/g, ' ');
145213
}
146214

@@ -154,9 +222,9 @@ function gatherText(element) {
154222
if (node.nodeType === node.ELEMENT_NODE && node.tagName === 'BR')
155223
str += '\n';
156224
else
157-
str += trimText(node.textContent) + ' ';
225+
str += node.textContent;
158226
}
159-
return str;
227+
return trimText(str);
160228
}
161229

162230
function parseTable(tableEl) {
@@ -173,7 +241,16 @@ function parseTable(tableEl) {
173241
}
174242

175243
function processURL(str) {
176-
return trimText(str).replace(/^http:/, 'https:');
244+
if (!str)
245+
return null;
246+
str = trimText(str);
247+
/*
248+
* Check for "Previous Version" in https://www.unicode.org/reports/tr38/tr38-5.html and
249+
* others, where it is "n/a".
250+
*/
251+
if (str.substring(0, 4) != 'http')
252+
return null;
253+
return str.replace(/^http:/, 'https:');
177254
}
178255

179256
function parseEditor(str) {
@@ -184,3 +261,22 @@ function parseEditor(str) {
184261
}
185262
return arr;
186263
}
264+
265+
/**
 * Extract the revision number from the URL of a specific report revision.
 *
 * Finds the pattern "/tr<num>/tr<num>-<revision>" in the URL. This works for
 * the two cases:
 *  - /tr<num>/tr<num>-<rev>/tr<num>.html (only UTS #35?)
 *  - /tr<num>/tr<num>-<rev>.html (all others)
 *
 * @param {?string} url - URL of a specific revision of a report.
 * @returns {?string} The revision digits as a string, or null when `url` is
 *     empty or does not contain the pattern.
 */
function parseRevision(url) {
    if (!url)
        return null;
    // The \1 backreference requires the same "tr<num>" in both path segments,
    // so a link pointing into a different report is rejected.
    const match = url.match(/\/(tr\d+)\/\1-(?<rev>\d+)/);
    return match ? match.groups.rev : null;
}
276+
277+
function parseVersion(str) {
278+
if (!str)
279+
return null;
280+
// Some have "Unicode 11.0.0" instead of the version alone. Strip it.
281+
return trimText(str).replace(/^Unicode\s*/, '');
282+
}

0 commit comments

Comments
 (0)