Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
188 changes: 144 additions & 44 deletions scripts/unicode.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,31 @@ const skip = new Set([
// Not in HTML
25, 54,
]);
// Upper bound on how many reports are processed in parallel by async.eachLimit below.
const MAX_CONCURRENCY = 5;
// When false, walking back through "Previous Version" links stops at the first
// revision that is already recorded; set to true to re-download every revision.
const REFETCH_OLD_VERSIONS = false;

async.eachLimit(range(1, MAX_REPORT), MAX_CONCURRENCY, (num, cb) => {
  if (skip.has(num)) {
    console.log('Skipping report #' + num);
    cb();
    return;
  }

  // Start at the landing page of the report; a null latestId marks it as the
  // first (i.e. latest) document scanned for this report number.
  const url = `https://www.unicode.org/reports/tr${num}/`;
  recurseStandard(num, url, null, cb);
}, (err) => {
  if (err) {
    console.log('there was an error');
    console.error(err);
    return;
  }
  // Emit the entries in sorted key order so the written file is stable.
  const output = {};
  for (const key of Object.keys(current).sort()) {
    output[key] = current[key];
  }
  helper.writeBiblio(FILENAME, output);
});

function recurseStandard(num, url, latestId, cb) {
console.log('Fetching', url, '...');
request({
url,
Expand All @@ -53,13 +69,7 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
console.log('Parsing', url, '...');
const dom = new JSDOM(body, { url });
const { document } = dom.window;
const type = document.title.slice(0, 3);
if (type !== 'UTS' && type !== 'UTR' && type !== 'UAX') {
console.log('Unable to parse title', document.title);
cb();
return;
}
const id = type + num;

const statusEl = document.querySelector('.body > h2');
if (!statusEl) {
console.log('Unable to find status');
Expand All @@ -68,6 +78,24 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
}
const status = trimText(statusEl.textContent);

let type = document.title.match(/\b(UTS|UTR|UAX)/);
if (type !== 'UTS' && type !== 'UTR' && type !== 'UAX') {
// Fallback for https://www.unicode.org/reports/tr35/
const lowerStatus = status.toLowerCase();
if (lowerStatus.indexOf('technical standard') != -1) {
type = 'UTS';
} else if (lowerStatus.indexOf('standard annex') != -1) {
type = 'UAX';
} else if (lowerStatus.indexOf('technical report') != -1) {
type = 'UTR';
} else {
console.log('Unable to parse document type');
cb();
return;
}
}
const thisId = type + num;

const titleEl = statusEl.nextElementSibling;
if (!titleEl || titleEl.tagName !== 'H1') {
console.log('Unable to find title');
Expand All @@ -86,62 +114,106 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
return;
}

if (latestId == null) {
// This is first scanned document, so the latest version.
latestId = thisId;

const authors = infoTable.Editor && parseEditor(infoTable.Editor);
if (!authors) {
console.log('Unable to find/parse editors in table');
cb();
return;
}

current[thisId] = {
href: url,
authors,
etAl: authors.etAl,
title,
status,
publisher: 'Unicode Consortium',
versions: current[latestId]?.versions ?? {}
};
} else if (thisId != latestId) {
// The document was renamed at some point - create link
current[thisId] = { aliasOf: latestId };
}

const date = trimText(infoTable.Date);
if (!date) {
if (!date || !/\d{4}-\d{2}-\d{2}/.test(date)) {
console.log('Unable to find date in table');
cb();
return;
}
let isRawDate = /\d{4}-\d{2}-\d{2}/.test(date);

const href = processURL(infoTable['This Version'] || url);
const href = processURL(infoTable['This Version']);
if (!href) {
console.log('Failed to extract version URL');
cb();
return;
}

const authors = infoTable.Editor && parseEditor(infoTable.Editor);
if (!authors) {
console.log('Unable to find/parse editors in table');
const revision = parseRevision(href);
if (!revision) {
console.log('Failed to extract revision');
cb();
return;
}

if (type !== 'UAX' && current[`UAX${num}`])
current[`UAX${num}`] = { aliasOf: id };
if (type !== 'UTR' && current[`UTR${num}`])
current[`UTR${num}`] = { aliasOf: id };
if (type !== 'UTS' && current[`UTS${num}`])
current[`UTS${num}`] = { aliasOf: id };
const version = parseVersion(infoTable.Version);

if (version)
title = `${title} version ${version}`;
else
title = `${title} revision ${revision}`;

current[id] = {
authors,
etAl: authors.etAl,
const wasAlreadyDefined = revision in current[latestId].versions;
current[latestId].versions[revision] = {
href,
rawDate: date,
title,
date: isRawDate ? undefined : date,
rawDate: isRawDate ? date : undefined,
status,
publisher: 'Unicode Consortium'
status: current[latestId].status != status ? status : undefined,
};

/*
* If this revision was already defined, then don't waste time and bandwidth fetching
* previous revisions which should have no changes.
*
* We're running this check after updating the information for this version in case this
* is the latest and is a WIP, as we have already downloaded it anyway.
*/
if (!wasAlreadyDefined || REFETCH_OLD_VERSIONS) {
const previousUrl = processURL(infoTable['Previous Version']);
if (previousUrl) {
recurseStandard(num, previousUrl, latestId, cb);
return;
}
}
cb();
});
}, (err) => {
if (err) {
console.log('there was an error');
console.error(err);
return;
}
const output = {};
for (const key of Object.keys(current).sort()) {
output[key] = current[key];
}
helper.writeBiblio(FILENAME, output);
});
}

/**
 * Lazily produces consecutive values starting at `from` and ending at `until`,
 * both bounds included. Yields nothing when `from` is greater than `until`.
 */
function* range(from, until) {
  let value = from;
  while (value <= until) {
    yield value;
    value++;
  }
}

/**
 * Normalises whitespace in text scraped from a report page.
 *
 * Removes every "®" sign, trims both ends, collapses each run of line breaks
 * (together with the whitespace around it) into a single "\n", and collapses
 * every other run of whitespace into a single space.
 *
 * @param {?string} str - Text to clean up.
 * @returns {?string} The cleaned text; falsy input is returned unchanged.
 */
function trimText(str) {
  if (!str)
    return str;
  let text = str.replace(/®/g, '').trim();

  /*
   * Replace consecutive newlines (with any surrounding spaces) with a single newline.
   * Technically the first [^\S\n]* could be simply \s* but writing it this way avoids
   * heavy backtracking for long stretches of spaces.
   *
   * [^\S\n] means "whitespace other than a newline"; it is equivalent to the
   * set-subtraction form [\s--\n] but does not need the `v` regex flag.
   */
  text = text.replace(/[^\S\n]*\n\s*/g, '\n');

  // Now replace all other spans of spaces, excluding new lines, with a single space
  text = text.replace(/[^\S\n]+/g, ' ');

  return text;
}

function titleCase(str) {
Expand All @@ -154,9 +226,9 @@ function gatherText(element) {
if (node.nodeType === node.ELEMENT_NODE && node.tagName === 'BR')
str += '\n';
else
str += trimText(node.textContent) + ' ';
str += node.textContent;
}
return str;
return trimText(str);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Calling trimText here introduces a side effect: a few pages (e.g., UTS37) separate authors with line breaks, and trimText replaces all whitespace and line terminators with a single space. The parseEditor function is then unable to split the authors, as \n no longer matches anything.

Copy link
Author

@socram8888 socram8888 Sep 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. The reason for the change is that some documents have inline coloring via <span>s.

For example, TR35-3 has its version set to:

1<span>.</span><span class="changed">2 (draft 5)</span>

The original version caused it to become:

1 . 2 (draft 5)

I will think of a better implementation that works for all cases.

}

function parseTable(tableEl) {
Expand All @@ -173,7 +245,16 @@ function parseTable(tableEl) {
}

/**
 * Cleans up a URL taken from a report's info table and upgrades it to HTTPS.
 *
 * @param {?string} str - Raw cell text expected to contain a URL.
 * @returns {?string} The trimmed URL with a leading "http:" replaced by
 *   "https:", or null when the input is missing or does not start with "http".
 */
function processURL(str) {
  if (!str)
    return null;
  const text = trimText(str);
  /*
   * Check for "Previous Version" in https://www.unicode.org/reports/tr38/tr38-5.html and
   * others, where it is "n/a".
   */
  if (!text.startsWith('http'))
    return null;
  return text.replace(/^http:/, 'https:');
}

function parseEditor(str) {
Expand All @@ -184,3 +265,22 @@ function parseEditor(str) {
}
return arr;
}

/**
 * Extracts the revision number from a versioned report URL.
 *
 * @param {?string} url - URL of a specific revision of a report.
 * @returns {?string} The revision digits as a string, or null when the URL is
 *   missing or does not contain the expected pattern.
 */
function parseRevision(url) {
  if (!url)
    return null;
  /*
   * Find in the URL the pattern "/tr<num>/tr<num>-<revision>". This works for the two cases:
   * - /tr<num>/tr<num>-<rev>/tr<num>.html (only UTS #35?)
   * - /tr<num>/tr<num>-<rev>.html (all others)
   *
   * The backreference \1 requires both "tr<num>" segments to be identical.
   */
  const match = url.match(/\/(tr\d+)\/\1-(?<rev>\d+)/);
  return match ? match.groups.rev : null;
}

/**
 * Cleans up a version string: the text is normalised with trimText and a
 * leading "Unicode" label (e.g. "Unicode 11.0.0") is dropped so that only the
 * version itself remains. Missing or empty input yields null.
 */
function parseVersion(str) {
  if (str) {
    const cleaned = trimText(str);
    return cleaned.replace(/^Unicode\s*/, '');
  }
  return null;
}