Skip to content

Commit 4185bc2

Browse files
committed
Improve Unicode script (#881)
Overhauled the script to extract all available revisions of each standard, so that it is possible to link to a specific one. The main URL for every Unicode standard now also points to the latest version live on the Unicode website.
1 parent 39ab837 commit 4185bc2

File tree

1 file changed

+140
-44
lines changed

1 file changed

+140
-44
lines changed

scripts/unicode.js

Lines changed: 140 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,31 @@ const skip = new Set([
2929
// Not in HTML
3030
25, 54,
3131
]);
32+
// Upper bound on the number of reports processed in parallel by the driver below.
const MAX_CONCURRENCY = 5;

// When false, recurseStandard stops following "Previous Version" links as soon as it
// reaches a revision that is already recorded; set to true to re-fetch every revision.
const REFETCH_OLD_VERSIONS = false;

/*
 * Driver: walk every report number from 1 to MAX_REPORT, at most MAX_CONCURRENCY at a
 * time. Skipped numbers complete immediately; every other report is handed to
 * recurseStandard starting from its main URL, with no latest id known yet (null).
 * Once all reports have completed, the collected entries are written out with their
 * keys in sorted order.
 */
async.eachLimit(range(1, MAX_REPORT), MAX_CONCURRENCY, (num, cb) => {
    if (skip.has(num)) {
        console.log(`Skipping report #${num}`);
        cb();
        return;
    }

    recurseStandard(num, `https://www.unicode.org/reports/tr${num}/`, null, cb);
}, (err) => {
    if (err) {
        console.log('there was an error');
        console.error(err);
        return;
    }

    // Copy the entries into a fresh object in sorted key order before writing them.
    const output = {};
    for (const key of Object.keys(current).sort()) {
        output[key] = current[key];
    }
    helper.writeBiblio(FILENAME, output);
});
55+
56+
function recurseStandard(num, url, latestId, cb) {
4157
console.log('Fetching', url, '...');
4258
request({
4359
url,
@@ -53,13 +69,7 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
5369
console.log('Parsing', url, '...');
5470
const dom = new JSDOM(body, { url });
5571
const { document } = dom.window;
56-
const type = document.title.slice(0, 3);
57-
if (type !== 'UTS' && type !== 'UTR' && type !== 'UAX') {
58-
console.log('Unable to parse title', document.title);
59-
cb();
60-
return;
61-
}
62-
const id = type + num;
72+
6373
const statusEl = document.querySelector('.body > h2');
6474
if (!statusEl) {
6575
console.log('Unable to find status');
@@ -68,6 +78,24 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
6878
}
6979
const status = trimText(statusEl.textContent);
7080

81+
let type = document.title.match(/\b(UTS|UTR|UAX)/);
82+
if (type !== 'UTS' && type !== 'UTR' && type !== 'UAX') {
83+
// Fallback for https://www.unicode.org/reports/tr35/
84+
const lowerStatus = status.toLowerCase();
85+
if (lowerStatus.indexOf('technical standard') != -1) {
86+
type = 'UTS';
87+
} else if (lowerStatus.indexOf('standard annex') != -1) {
88+
type = 'UAX';
89+
} else if (lowerStatus.indexOf('technical report') != -1) {
90+
type = 'UTR';
91+
} else {
92+
console.log('Unable to parse document type');
93+
cb();
94+
return;
95+
}
96+
}
97+
const thisId = type + num;
98+
7199
const titleEl = statusEl.nextElementSibling;
72100
if (!titleEl || titleEl.tagName !== 'H1') {
73101
console.log('Unable to find title');
@@ -86,62 +114,102 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
86114
return;
87115
}
88116

117+
if (latestId == null) {
118+
// This is first scanned document, so the latest version.
119+
latestId = thisId;
120+
121+
const authors = infoTable.Editor && parseEditor(infoTable.Editor);
122+
if (!authors) {
123+
console.log('Unable to find/parse editors in table');
124+
cb();
125+
return;
126+
}
127+
128+
current[thisId] = {
129+
href: url,
130+
authors,
131+
etAl: authors.etAl,
132+
title,
133+
status,
134+
publisher: 'Unicode Consortium',
135+
versions: current[latestId]?.versions ?? {}
136+
};
137+
} else if (thisId != latestId) {
138+
// The document was renamed at some point - create link
139+
current[thisId] = { aliasOf: latestId };
140+
}
141+
89142
const date = trimText(infoTable.Date);
90-
if (!date) {
143+
if (!date || !/\d{4}-\d{2}-\d{2}/.test(date)) {
91144
console.log('Unable to find date in table');
92145
cb();
93146
return;
94147
}
95-
let isRawDate = /\d{4}-\d{2}-\d{2}/.test(date);
96148

97-
const href = processURL(infoTable['This Version'] || url);
149+
const href = processURL(infoTable['This Version']);
150+
if (!href) {
151+
console.log('Failed to extract version URL');
152+
cb();
153+
return;
154+
}
98155

99-
const authors = infoTable.Editor && parseEditor(infoTable.Editor);
100-
if (!authors) {
101-
console.log('Unable to find/parse editors in table');
156+
const revision = parseRevision(href);
157+
if (!revision) {
158+
console.log('Failed to extract revision');
102159
cb();
103160
return;
104161
}
105162

106-
if (type !== 'UAX' && current[`UAX${num}`])
107-
current[`UAX${num}`] = { aliasOf: id };
108-
if (type !== 'UTR' && current[`UTR${num}`])
109-
current[`UTR${num}`] = { aliasOf: id };
110-
if (type !== 'UTS' && current[`UTS${num}`])
111-
current[`UTS${num}`] = { aliasOf: id };
163+
const version = parseVersion(infoTable.Version);
164+
165+
if (version)
166+
title = `${title} version ${version}`;
167+
else
168+
title = `${title} revision ${revision}`;
112169

113-
current[id] = {
114-
authors,
115-
etAl: authors.etAl,
170+
const wasAlreadyDefined = revision in current[latestId].versions;
171+
current[latestId].versions[revision] = {
116172
href,
173+
rawDate: date,
117174
title,
118-
date: isRawDate ? undefined : date,
119-
rawDate: isRawDate ? date : undefined,
120-
status,
121-
publisher: 'Unicode Consortium'
175+
status: current[latestId].status != status ? status : undefined,
122176
};
177+
178+
/*
179+
* If this revision was already defined, then don't waste time and bandwidth fetching
180+
* previous revisions which should have no changes.
181+
*
182+
* We're running this check after updating the information for this version in case this
183+
* is the latest and is a WIP, as we have already downloaded it anyway.
184+
*/
185+
if (!wasAlreadyDefined || REFETCH_OLD_VERSIONS) {
186+
const previousUrl = processURL(infoTable['Previous Version']);
187+
if (previousUrl) {
188+
recurseStandard(num, previousUrl, latestId, cb);
189+
return;
190+
}
191+
}
123192
cb();
124193
});
125-
}, (err) => {
126-
if (err) {
127-
console.log('there was an error');
128-
console.error(err);
129-
return;
130-
}
131-
const output = {};
132-
for (const key of Object.keys(current).sort()) {
133-
output[key] = current[key];
134-
}
135-
helper.writeBiblio(FILENAME, output);
136-
});
194+
}
137195

138196
/**
 * Lazily yields every integer from `from` up to and including `until`.
 * Yields nothing when `from` is greater than `until`.
 *
 * @param {number} from - First value to yield.
 * @param {number} until - Last value to yield (inclusive).
 * @yields {number} The next integer in the range.
 */
function* range(from, until) {
    for (let i = from; i <= until; i++) {
        yield i;
    }
}
142200

143201
/**
 * Normalizes whitespace in a piece of extracted text.
 *
 * Removes every "®" character, trims both ends, collapses each run of whitespace that
 * contains at least one newline into a single newline, and collapses every remaining
 * run of non-newline whitespace into a single space.
 *
 * @param {?string} str - Text to normalize.
 * @returns {?string} The normalized text; falsy input (null, undefined, '') is
 *     returned unchanged.
 */
function trimText(str) {
    if (!str) {
        return str;
    }

    const stripped = str.replace(/®/g, '').trim();

    // A newline together with any whitespace around it becomes one newline. After the
    // first newline every following whitespace character belongs to the same run, so
    // `\s*` is enough to swallow it. `[^\S\n]` means "whitespace other than newline".
    const singleNewlines = stripped.replace(/[^\S\n]*\n\s*/g, '\n');

    // Every other span of whitespace (newlines excluded) becomes a single space.
    return singleNewlines.replace(/[^\S\n]+/g, ' ');
}
146214

147215
function titleCase(str) {
@@ -154,9 +222,9 @@ function gatherText(element) {
154222
if (node.nodeType === node.ELEMENT_NODE && node.tagName === 'BR')
155223
str += '\n';
156224
else
157-
str += trimText(node.textContent) + ' ';
225+
str += node.textContent;
158226
}
159-
return str;
227+
return trimText(str);
160228
}
161229

162230
function parseTable(tableEl) {
@@ -173,7 +241,16 @@ function parseTable(tableEl) {
173241
}
174242

175243
/**
 * Extracts a URL from a table cell and upgrades it to https.
 *
 * @param {?string} str - Raw cell text.
 * @returns {?string} The normalized URL with a leading "http:" replaced by "https:",
 *     or null when the input is empty or does not start with "http".
 */
function processURL(str) {
    if (!str) {
        return null;
    }

    const text = trimText(str);

    /*
     * Check for "Previous Version" in https://www.unicode.org/reports/tr38/tr38-5.html and
     * others, where it is "n/a".
     */
    if (!text.startsWith('http')) {
        return null;
    }

    return text.replace(/^http:/, 'https:');
}
178255

179256
function parseEditor(str) {
@@ -184,3 +261,22 @@ function parseEditor(str) {
184261
}
185262
return arr;
186263
}
264+
265+
/**
 * Extracts the revision number from the URL of a specific revision of a report.
 *
 * @param {?string} url - URL of a specific revision.
 * @returns {?string} The revision digits as a string, or null when the URL is empty or
 *     does not contain the expected pattern.
 */
function parseRevision(url) {
    if (!url) {
        return null;
    }

    /*
     * Find in the URL the pattern "/tr<num>/tr<num>-<revision>". This works for the two cases:
     *  - /tr<num>/tr<num>-<rev>/tr<num>.html (only UTS #35?)
     *  - /tr<num>/tr<num>-<rev>.html (all others)
     * The backreference requires both "tr<num>" segments to be identical.
     */
    const match = url.match(/\/(tr\d+)\/\1-(?<rev>\d+)/);
    return match ? match.groups.rev : null;
}
276+
277+
/**
 * Extracts the version string from the "Version" table cell.
 *
 * @param {?string} str - Raw cell text.
 * @returns {?string} The version with any leading "Unicode" prefix removed, or null
 *     when the cell is missing or nothing is left after stripping the prefix.
 */
function parseVersion(str) {
    if (!str) {
        return null;
    }

    // Some have "Unicode 11.0.0" instead of the version alone. Strip it.
    const version = trimText(str).replace(/^Unicode\s*/, '');

    // Report "no version" uniformly as null rather than as an empty string.
    return version || null;
}

0 commit comments

Comments
 (0)