Commit a2d77f1

Improve Unicode script (#881)
Overhauled the script to extract all available revisions of each standard, so it is possible to link to a specific one. The main URL for every Unicode standard now points to the latest revision live on the Unicode website.
1 parent 39ab837 commit a2d77f1
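The diff below reworks the script so that each standard is stored once under its latest identifier, every discovered revision is recorded in a versions map keyed by revision number, and renamed reports point at the live entry via aliasOf. As a rough sketch of the resulting biblio shape, assuming the field names used in the script (the report, title, dates, and revision numbers here are purely illustrative):

// Illustrative sketch only - field names follow scripts/unicode.js; the values are made up.
const exampleBiblio = {
    UTS10: {
        href: 'https://www.unicode.org/reports/tr10/',  // main URL, always the latest live copy
        authors: [/* parsed from the "Editor" row of the report's info table */],
        title: 'Unicode Collation Algorithm',
        status: 'Unicode Technical Standard #10',       // hypothetical status text
        publisher: 'Unicode Consortium',
        versions: {
            // keyed by the revision number parsed out of the "This Version" URL
            '40': {
                href: 'https://www.unicode.org/reports/tr10/tr10-40.html',
                rawDate: '2019-02-15',                  // hypothetical date
                title: 'Unicode Collation Algorithm revision 40'
            }
        }
    },
    UTR10: { aliasOf: 'UTS10' }  // written when a report was renamed at some point
};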

scripts/unicode.js

Lines changed: 131 additions & 43 deletions
@@ -29,15 +29,31 @@ const skip = new Set([
     // Not in HTML
     25, 54,
 ]);
+const MAX_CONCURRENCY = 5;
+const REFETCH_OLD_VERSIONS = false;
 
-async.each(range(1, MAX_REPORT), (num, cb) => {
+async.eachLimit(range(1, MAX_REPORT), MAX_CONCURRENCY, (num, cb) => {
     if (skip.has(num)) {
         console.log('Skipping report #' + num);
         cb();
         return;
     }
 
-    const url = `https://www.unicode.org/reports/tr${num}/`;
+    recurseStandard(num, `https://www.unicode.org/reports/tr${num}/`, null, cb);
+}, (err) => {
+    if (err) {
+        console.log('there was an error');
+        console.error(err);
+        return;
+    }
+    const output = {};
+    for (const key of Object.keys(current).sort()) {
+        output[key] = current[key];
+    }
+    helper.writeBiblio(FILENAME, output);
+});
+
+function recurseStandard(num, url, latestId, cb) {
     console.log('Fetching', url, '...');
     request({
         url,
@@ -53,13 +69,7 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
         console.log('Parsing', url, '...');
         const dom = new JSDOM(body, { url });
         const { document } = dom.window;
-        const type = document.title.slice(0, 3);
-        if (type !== 'UTS' && type !== 'UTR' && type !== 'UAX') {
-            console.log('Unable to parse title', document.title);
-            cb();
-            return;
-        }
-        const id = type + num;
+
         const statusEl = document.querySelector('.body > h2');
         if (!statusEl) {
             console.log('Unable to find status');
@@ -68,6 +78,24 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
         }
         const status = trimText(statusEl.textContent);
 
+        let type = document.title.match(/\b(UTS|UTR|UAX)/);
+        if (type !== 'UTS' && type !== 'UTR' && type !== 'UAX') {
+            // Fallback for https://www.unicode.org/reports/tr35/
+            const lowerStatus = status.toLowerCase();
+            if (lowerStatus.indexOf('technical standard') != -1) {
+                type = 'UTS';
+            } else if (lowerStatus.indexOf('standard annex') != -1) {
+                type = 'UAX';
+            } else if (lowerStatus.indexOf('technical report') != -1) {
+                type = 'UTR';
+            } else {
+                console.log('Unable to parse document type');
+                cb();
+                return;
+            }
+        }
+        const thisId = type + num;
+
         const titleEl = statusEl.nextElementSibling;
         if (!titleEl || titleEl.tagName !== 'H1') {
             console.log('Unable to find title');
@@ -86,61 +114,93 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
             return;
         }
 
+        if (latestId == null) {
+            // This is first scanned document, so the latest version.
+            latestId = thisId;
+
+            const authors = infoTable.Editor && parseEditor(infoTable.Editor);
+            if (!authors) {
+                console.log('Unable to find/parse editors in table');
+                cb();
+                return;
+            }
+
+            current[thisId] = {
+                href: url,
+                authors,
+                etAl: authors.etAl,
+                title,
+                status,
+                publisher: 'Unicode Consortium',
+                versions: current[latestId]?.versions ?? {}
+            };
+        } else if (thisId != latestId) {
+            // The document was renamed at some point - create link
+            current[thisId] = { aliasOf: latestId };
+        }
+
         const date = trimText(infoTable.Date);
-        if (!date) {
+        if (!date || !/\d{4}-\d{2}-\d{2}/.test(date)) {
             console.log('Unable to find date in table');
             cb();
             return;
         }
-        let isRawDate = /\d{4}-\d{2}-\d{2}/.test(date);
 
-        const href = processURL(infoTable['This Version'] || url);
+        const href = processURL(infoTable['This Version']);
+        if (!href) {
+            console.log('Failed to extract version URL');
+            cb();
+            return;
+        }
 
-        const authors = infoTable.Editor && parseEditor(infoTable.Editor);
-        if (!authors) {
-            console.log('Unable to find/parse editors in table');
+        const revision = parseRevision(href);
+        if (!revision) {
+            console.log('Failed to extract revision');
             cb();
             return;
         }
 
-        if (type !== 'UAX' && current[`UAX${num}`])
-            current[`UAX${num}`] = { aliasOf: id };
-        if (type !== 'UTR' && current[`UTR${num}`])
-            current[`UTR${num}`] = { aliasOf: id };
-        if (type !== 'UTS' && current[`UTS${num}`])
-            current[`UTS${num}`] = { aliasOf: id };
+        const version = parseVersion(infoTable.Version);
 
-        current[id] = {
-            authors,
-            etAl: authors.etAl,
+        if (version)
+            title = `${title} version ${version}`;
+        else
+            title = `${title} revision ${revision}`;
+
+        const wasAlreadyDefined = revision in current[latestId].versions;
+        current[latestId].versions[revision] = {
             href,
+            rawDate: date,
             title,
-            date: isRawDate ? undefined : date,
-            rawDate: isRawDate ? date : undefined,
-            status,
-            publisher: 'Unicode Consortium'
+            status: current[latestId].status != status ? status : undefined,
         };
+
+        /*
+         * If this revision was already defined, then don't waste time and bandwidth fetching
+         * previous revisions which should have no changes.
+         *
+         * We're running this check after updating the information for this version in case this
+         * is the latest and is a WIP, as we have already downloaded it anyway.
+         */
+        if (!wasAlreadyDefined || REFETCH_OLD_VERSIONS) {
+            const previousUrl = processURL(infoTable['Previous Version']);
+            if (previousUrl) {
+                recurseStandard(num, previousUrl, latestId, cb);
+                return;
+            }
+        }
         cb();
     });
-}, (err) => {
-    if (err) {
-        console.log('there was an error');
-        console.error(err);
-        return;
-    }
-    const output = {};
-    for (const key of Object.keys(current).sort()) {
-        output[key] = current[key];
-    }
-    helper.writeBiblio(FILENAME, output);
-});
+}
 
 function* range(from, until) {
     for (let i = from; i <= until; i++)
         yield i;
 }
 
 function trimText(str) {
+    if (!str)
+        return str;
     return str.replace(/®/g, '').trim().replace(/\s+/g, ' ');
 }
 
@@ -154,9 +214,9 @@ function gatherText(element) {
         if (node.nodeType === node.ELEMENT_NODE && node.tagName === 'BR')
             str += '\n';
         else
-            str += trimText(node.textContent) + ' ';
+            str += node.textContent;
     }
-    return str;
+    return trimText(str);
 }
 
 function parseTable(tableEl) {
@@ -173,7 +233,16 @@ function parseTable(tableEl) {
 }
 
 function processURL(str) {
-    return trimText(str).replace(/^http:/, 'https:');
+    if (!str)
+        return null;
+    str = trimText(str);
+    /*
+     * Check for "Previous Version" in https://www.unicode.org/reports/tr38/tr38-5.html and
+     * others, where it is "n/a".
+     */
+    if (str.substring(0, 4) != 'http')
+        return null;
+    return str.replace(/^http:/, 'https:');
 }
 
 function parseEditor(str) {
@@ -184,3 +253,22 @@ function parseEditor(str) {
     }
     return arr;
 }
+
+function parseRevision(url) {
+    if (!url)
+        return null;
+    /*
+     * Find a in the URL the pattern "/tr<num>/tr<num>-<revision>". This works for the two cases:
+     * - /tr<num>/tr<num>-<rev>/tr<num>.html (only UTS #35?)
+     * - /tr<num>/tr<num>-<rev>.html (all others)
+     */
+    const match = url.match(/\/(tr\d+)\/\1-(?<rev>\d+)/, url);
+    return match ? match.groups.rev : null;
+}
+
+function parseVersion(str) {
+    if (!str)
+        return null;
+    // Some have "Unicode 11.0.0" instead of the version alone. Strip it.
+    return trimText(str).replace(/^Unicode\s*/, '');
+}
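As a quick aside on the new parseRevision helper: restated on its own (behaviour intended to match the function in the diff above), it pulls the revision number out of the two URL shapes its comment describes. The specific URLs and revision numbers below are illustrative, not taken from the commit:

// Restated from the diff above for illustration only.
function parseRevisionSketch(url) {
    if (!url)
        return null;
    // Matches "/tr<num>/tr<num>-<revision>" in either URL layout used by Unicode reports.
    const match = url.match(/\/(tr\d+)\/\1-(?<rev>\d+)/);
    return match ? match.groups.rev : null;
}

parseRevisionSketch('https://www.unicode.org/reports/tr44/tr44-26.html');      // '26'  (/tr<num>/tr<num>-<rev>.html)
parseRevisionSketch('https://www.unicode.org/reports/tr35/tr35-61/tr35.html'); // '61'  (/tr<num>/tr<num>-<rev>/tr<num>.html)
parseRevisionSketch('https://www.unicode.org/reports/tr44/');                  // null  (no revision in the URL)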
